From d622b66a820a0e5e61c131e9ae5b4db35292aa14 Mon Sep 17 00:00:00 2001 From: josel-amd Date: Thu, 2 Jan 2025 14:58:15 +0100 Subject: [PATCH 001/480] Re-introduce Type Conversion on EmitC (#121476) This PR reintroduces https://github.com/llvm/llvm-project/pull/118940 with a fix for the build issues on cd9caf3aeed55280537052227f08bb1b41154efd --- .../mlir/Conversion/SCFToEmitC/SCFToEmitC.h | 4 +- mlir/lib/Conversion/SCFToEmitC/CMakeLists.txt | 1 + mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp | 206 ++++++++++++------ mlir/test/Conversion/SCFToEmitC/for.mlir | 89 +++++++- mlir/test/Conversion/SCFToEmitC/switch.mlir | 9 +- 5 files changed, 229 insertions(+), 80 deletions(-) diff --git a/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h b/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h index 22df7f1c5dcf2..acc39e6acf726 100644 --- a/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h +++ b/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h @@ -9,6 +9,7 @@ #ifndef MLIR_CONVERSION_SCFTOEMITC_SCFTOEMITC_H #define MLIR_CONVERSION_SCFTOEMITC_SCFTOEMITC_H +#include "mlir/Transforms/DialectConversion.h" #include namespace mlir { @@ -19,7 +20,8 @@ class RewritePatternSet; #include "mlir/Conversion/Passes.h.inc" /// Collect a set of patterns to convert SCF operations to the EmitC dialect. 
-void populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns); +void populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns, + TypeConverter &typeConverter); } // namespace mlir #endif // MLIR_CONVERSION_SCFTOEMITC_SCFTOEMITC_H diff --git a/mlir/lib/Conversion/SCFToEmitC/CMakeLists.txt b/mlir/lib/Conversion/SCFToEmitC/CMakeLists.txt index 79119d374f7a5..af5493be8a4b3 100644 --- a/mlir/lib/Conversion/SCFToEmitC/CMakeLists.txt +++ b/mlir/lib/Conversion/SCFToEmitC/CMakeLists.txt @@ -13,6 +13,7 @@ add_mlir_conversion_library(MLIRSCFToEmitC LINK_LIBS PUBLIC MLIRArithDialect MLIREmitCDialect + MLIREmitCTransforms MLIRSCFDialect MLIRTransforms ) diff --git a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp index 67a43c43d608b..92523ca4f12b2 100644 --- a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp +++ b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/EmitC/IR/EmitC.h" +#include "mlir/Dialect/EmitC/Transforms/TypeConversions.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" @@ -39,21 +40,22 @@ struct SCFToEmitCPass : public impl::SCFToEmitCBase { // Lower scf::for to emitc::for, implementing result values using // emitc::variable's updated within the loop body. -struct ForLowering : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +struct ForLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(ForOp forOp, - PatternRewriter &rewriter) const override; + LogicalResult + matchAndRewrite(ForOp forOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; }; // Create an uninitialized emitc::variable op for each result of the given op. 
template -static SmallVector createVariablesForResults(T op, - PatternRewriter &rewriter) { - SmallVector resultVariables; - +static LogicalResult +createVariablesForResults(T op, const TypeConverter *typeConverter, + ConversionPatternRewriter &rewriter, + SmallVector &resultVariables) { if (!op.getNumResults()) - return resultVariables; + return success(); Location loc = op->getLoc(); MLIRContext *context = op.getContext(); @@ -62,7 +64,9 @@ static SmallVector createVariablesForResults(T op, rewriter.setInsertionPoint(op); for (OpResult result : op.getResults()) { - Type resultType = result.getType(); + Type resultType = typeConverter->convertType(result.getType()); + if (!resultType) + return rewriter.notifyMatchFailure(op, "result type conversion failed"); Type varType = emitc::LValueType::get(resultType); emitc::OpaqueAttr noInit = emitc::OpaqueAttr::get(context, ""); emitc::VariableOp var = @@ -70,13 +74,13 @@ static SmallVector createVariablesForResults(T op, resultVariables.push_back(var); } - return resultVariables; + return success(); } // Create a series of assign ops assigning given values to given variables at // the current insertion point of given rewriter. 
-static void assignValues(ValueRange values, SmallVector &variables, - PatternRewriter &rewriter, Location loc) { +static void assignValues(ValueRange values, ValueRange variables, + ConversionPatternRewriter &rewriter, Location loc) { for (auto [value, var] : llvm::zip(values, variables)) rewriter.create(loc, var, value); } @@ -89,18 +93,25 @@ SmallVector loadValues(const SmallVector &variables, }); } -static void lowerYield(SmallVector &resultVariables, - PatternRewriter &rewriter, scf::YieldOp yield) { +static LogicalResult lowerYield(Operation *op, ValueRange resultVariables, + ConversionPatternRewriter &rewriter, + scf::YieldOp yield) { Location loc = yield.getLoc(); - ValueRange operands = yield.getOperands(); OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(yield); - assignValues(operands, resultVariables, rewriter, loc); + SmallVector yieldOperands; + if (failed(rewriter.getRemappedValues(yield.getOperands(), yieldOperands))) { + return rewriter.notifyMatchFailure(op, "failed to lower yield operands"); + } + + assignValues(yieldOperands, resultVariables, rewriter, loc); rewriter.create(loc); rewriter.eraseOp(yield); + + return success(); } // Lower the contents of an scf::if/scf::index_switch regions to an @@ -108,27 +119,32 @@ static void lowerYield(SmallVector &resultVariables, // moved into the respective lowered region, but the scf::yield is replaced not // only with an emitc::yield, but also with a sequence of emitc::assign ops that // set the yielded values into the result variables. 
-static void lowerRegion(SmallVector &resultVariables, - PatternRewriter &rewriter, Region ®ion, - Region &loweredRegion) { +static LogicalResult lowerRegion(Operation *op, ValueRange resultVariables, + ConversionPatternRewriter &rewriter, + Region ®ion, Region &loweredRegion) { rewriter.inlineRegionBefore(region, loweredRegion, loweredRegion.end()); Operation *terminator = loweredRegion.back().getTerminator(); - lowerYield(resultVariables, rewriter, cast(terminator)); + return lowerYield(op, resultVariables, rewriter, + cast(terminator)); } -LogicalResult ForLowering::matchAndRewrite(ForOp forOp, - PatternRewriter &rewriter) const { +LogicalResult +ForLowering::matchAndRewrite(ForOp forOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { Location loc = forOp.getLoc(); // Create an emitc::variable op for each result. These variables will be // assigned to by emitc::assign ops within the loop body. - SmallVector resultVariables = - createVariablesForResults(forOp, rewriter); + SmallVector resultVariables; + if (failed(createVariablesForResults(forOp, getTypeConverter(), rewriter, + resultVariables))) + return rewriter.notifyMatchFailure(forOp, + "create variables for results failed"); - assignValues(forOp.getInits(), resultVariables, rewriter, loc); + assignValues(adaptor.getInitArgs(), resultVariables, rewriter, loc); emitc::ForOp loweredFor = rewriter.create( - loc, forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep()); + loc, adaptor.getLowerBound(), adaptor.getUpperBound(), adaptor.getStep()); Block *loweredBody = loweredFor.getBody(); @@ -143,13 +159,27 @@ LogicalResult ForLowering::matchAndRewrite(ForOp forOp, rewriter.restoreInsertionPoint(ip); + // Convert the original region types into the new types by adding unrealized + // casts in the beginning of the loop. This performs the conversion in place. 
+ if (failed(rewriter.convertRegionTypes(&forOp.getRegion(), + *getTypeConverter(), nullptr))) { + return rewriter.notifyMatchFailure(forOp, "region types conversion failed"); + } + + // Register the replacements for the block arguments and inline the body of + // the scf.for loop into the body of the emitc::for loop. + Block *scfBody = &(forOp.getRegion().front()); SmallVector replacingValues; replacingValues.push_back(loweredFor.getInductionVar()); replacingValues.append(iterArgsValues.begin(), iterArgsValues.end()); + rewriter.mergeBlocks(scfBody, loweredBody, replacingValues); - rewriter.mergeBlocks(forOp.getBody(), loweredBody, replacingValues); - lowerYield(resultVariables, rewriter, - cast(loweredBody->getTerminator())); + auto result = lowerYield(forOp, resultVariables, rewriter, + cast(loweredBody->getTerminator())); + + if (failed(result)) { + return result; + } // Load variables into SSA values after the for loop. SmallVector resultValues = loadValues(resultVariables, rewriter, loc); @@ -160,38 +190,66 @@ LogicalResult ForLowering::matchAndRewrite(ForOp forOp, // Lower scf::if to emitc::if, implementing result values as emitc::variable's // updated within the then and else regions. -struct IfLowering : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +struct IfLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(IfOp ifOp, - PatternRewriter &rewriter) const override; + LogicalResult + matchAndRewrite(IfOp ifOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; }; } // namespace -LogicalResult IfLowering::matchAndRewrite(IfOp ifOp, - PatternRewriter &rewriter) const { +LogicalResult +IfLowering::matchAndRewrite(IfOp ifOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { Location loc = ifOp.getLoc(); // Create an emitc::variable op for each result. 
These variables will be // assigned to by emitc::assign ops within the then & else regions. - SmallVector resultVariables = - createVariablesForResults(ifOp, rewriter); - - Region &thenRegion = ifOp.getThenRegion(); - Region &elseRegion = ifOp.getElseRegion(); + SmallVector resultVariables; + if (failed(createVariablesForResults(ifOp, getTypeConverter(), rewriter, + resultVariables))) + return rewriter.notifyMatchFailure(ifOp, + "create variables for results failed"); + + // Utility function to lower the contents of an scf::if region to an emitc::if + // region. The contents of the scf::if regions is moved into the respective + // emitc::if regions, but the scf::yield is replaced not only with an + // emitc::yield, but also with a sequence of emitc::assign ops that set the + // yielded values into the result variables. + auto lowerRegion = [&resultVariables, &rewriter, + &ifOp](Region ®ion, Region &loweredRegion) { + rewriter.inlineRegionBefore(region, loweredRegion, loweredRegion.end()); + Operation *terminator = loweredRegion.back().getTerminator(); + auto result = lowerYield(ifOp, resultVariables, rewriter, + cast(terminator)); + if (failed(result)) { + return result; + } + return success(); + }; + + Region &thenRegion = adaptor.getThenRegion(); + Region &elseRegion = adaptor.getElseRegion(); bool hasElseBlock = !elseRegion.empty(); auto loweredIf = - rewriter.create(loc, ifOp.getCondition(), false, false); + rewriter.create(loc, adaptor.getCondition(), false, false); Region &loweredThenRegion = loweredIf.getThenRegion(); - lowerRegion(resultVariables, rewriter, thenRegion, loweredThenRegion); + auto result = lowerRegion(thenRegion, loweredThenRegion); + if (failed(result)) { + return result; + } if (hasElseBlock) { Region &loweredElseRegion = loweredIf.getElseRegion(); - lowerRegion(resultVariables, rewriter, elseRegion, loweredElseRegion); + auto result = lowerRegion(elseRegion, loweredElseRegion); + if (failed(result)) { + return result; + } } 
rewriter.setInsertionPointAfter(ifOp); @@ -203,37 +261,46 @@ LogicalResult IfLowering::matchAndRewrite(IfOp ifOp, // Lower scf::index_switch to emitc::switch, implementing result values as // emitc::variable's updated within the case and default regions. -struct IndexSwitchOpLowering : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +struct IndexSwitchOpLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite(IndexSwitchOp indexSwitchOp, - PatternRewriter &rewriter) const override; + LogicalResult + matchAndRewrite(IndexSwitchOp indexSwitchOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; }; -LogicalResult -IndexSwitchOpLowering::matchAndRewrite(IndexSwitchOp indexSwitchOp, - PatternRewriter &rewriter) const { +LogicalResult IndexSwitchOpLowering::matchAndRewrite( + IndexSwitchOp indexSwitchOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { Location loc = indexSwitchOp.getLoc(); // Create an emitc::variable op for each result. These variables will be // assigned to by emitc::assign ops within the case and default regions. - SmallVector resultVariables = - createVariablesForResults(indexSwitchOp, rewriter); + SmallVector resultVariables; + if (failed(createVariablesForResults(indexSwitchOp, getTypeConverter(), + rewriter, resultVariables))) { + return rewriter.notifyMatchFailure(indexSwitchOp, + "create variables for results failed"); + } auto loweredSwitch = rewriter.create( - loc, indexSwitchOp.getArg(), indexSwitchOp.getCases(), - indexSwitchOp.getNumCases()); + loc, adaptor.getArg(), adaptor.getCases(), indexSwitchOp.getNumCases()); // Lowering all case regions. 
- for (auto pair : llvm::zip(indexSwitchOp.getCaseRegions(), - loweredSwitch.getCaseRegions())) { - lowerRegion(resultVariables, rewriter, std::get<0>(pair), - std::get<1>(pair)); + for (auto pair : + llvm::zip(adaptor.getCaseRegions(), loweredSwitch.getCaseRegions())) { + if (failed(lowerRegion(indexSwitchOp, resultVariables, rewriter, + *std::get<0>(pair), std::get<1>(pair)))) { + return failure(); + } } // Lowering default region. - lowerRegion(resultVariables, rewriter, indexSwitchOp.getDefaultRegion(), - loweredSwitch.getDefaultRegion()); + if (failed(lowerRegion(indexSwitchOp, resultVariables, rewriter, + adaptor.getDefaultRegion(), + loweredSwitch.getDefaultRegion()))) { + return failure(); + } rewriter.setInsertionPointAfter(indexSwitchOp); SmallVector results = loadValues(resultVariables, rewriter, loc); @@ -242,15 +309,22 @@ IndexSwitchOpLowering::matchAndRewrite(IndexSwitchOp indexSwitchOp, return success(); } -void mlir::populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); - patterns.add(patterns.getContext()); - patterns.add(patterns.getContext()); +void mlir::populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns, + TypeConverter &typeConverter) { + patterns.add(typeConverter, patterns.getContext()); + patterns.add(typeConverter, patterns.getContext()); + patterns.add(typeConverter, patterns.getContext()); } void SCFToEmitCPass::runOnOperation() { RewritePatternSet patterns(&getContext()); - populateSCFToEmitCConversionPatterns(patterns); + TypeConverter typeConverter; + // Fallback converter + // See note https://mlir.llvm.org/docs/DialectConversion/#type-converter + // Type converters are called most to least recently inserted + typeConverter.addConversion([](Type t) { return t; }); + populateEmitCSizeTTypeConversions(typeConverter); + populateSCFToEmitCConversionPatterns(patterns, typeConverter); // Configure conversion to lower out SCF operations. 
ConversionTarget target(getContext()); diff --git a/mlir/test/Conversion/SCFToEmitC/for.mlir b/mlir/test/Conversion/SCFToEmitC/for.mlir index 83592187a9b68..7f41e636936b8 100644 --- a/mlir/test/Conversion/SCFToEmitC/for.mlir +++ b/mlir/test/Conversion/SCFToEmitC/for.mlir @@ -7,8 +7,11 @@ func.func @simple_std_for_loop(%arg0 : index, %arg1 : index, %arg2 : index) { return } // CHECK-LABEL: func.func @simple_std_for_loop( -// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) { -// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) { +// CHECK-NEXT: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { // CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 1 : index // CHECK-NEXT: } // CHECK-NEXT: return @@ -24,10 +27,13 @@ func.func @simple_std_2_for_loops(%arg0 : index, %arg1 : index, %arg2 : index) { return } // CHECK-LABEL: func.func @simple_std_2_for_loops( -// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) { -// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) { +// CHECK-NEXT: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : 
!emitc.size_t { // CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { // CHECK-NEXT: %[[VAL_6:.*]] = arith.constant 1 : index // CHECK-NEXT: } // CHECK-NEXT: } @@ -44,14 +50,17 @@ func.func @for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> (f32, f32) return %result#0, %result#1 : f32, f32 } // CHECK-LABEL: func.func @for_yield( -// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) -> (f32, f32) { +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> (f32, f32) { +// CHECK-NEXT: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t // CHECK-NEXT: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f32 // CHECK-NEXT: %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK-NEXT: %[[VAL_6:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK-NEXT: emitc.assign %[[VAL_3]] : f32 to %[[VAL_5]] : // CHECK-NEXT: emitc.assign %[[VAL_4]] : f32 to %[[VAL_6]] : -// CHECK-NEXT: emitc.for %[[VAL_7:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: emitc.for %[[VAL_7:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { // CHECK-NEXT: %[[VAL_8:.*]] = emitc.load %[[VAL_5]] : // CHECK-NEXT: %[[VAL_9:.*]] = emitc.load %[[VAL_6]] : // CHECK-NEXT: %[[VAL_10:.*]] = arith.addf %[[VAL_8]], %[[VAL_9]] : f32 @@ -75,15 +84,18 @@ func.func @nested_for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> f32 return %r : f32 } // CHECK-LABEL: func.func 
@nested_for_yield( -// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) -> f32 { +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> f32 { +// CHECK-NEXT: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK-NEXT: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t // CHECK-NEXT: %[[VAL_3:.*]] = arith.constant 1.000000e+00 : f32 // CHECK-NEXT: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK-NEXT: emitc.assign %[[VAL_3]] : f32 to %[[VAL_4]] : -// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { // CHECK-NEXT: %[[VAL_6:.*]] = emitc.load %[[VAL_4]] : // CHECK-NEXT: %[[VAL_7:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK-NEXT: emitc.assign %[[VAL_6]] : f32 to %[[VAL_7]] : -// CHECK-NEXT: emitc.for %[[VAL_8:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: emitc.for %[[VAL_8:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t { // CHECK-NEXT: %[[VAL_9:.*]] = emitc.load %[[VAL_7]] : // CHECK-NEXT: %[[VAL_10:.*]] = arith.addf %[[VAL_9]], %[[VAL_9]] : f32 // CHECK-NEXT: emitc.assign %[[VAL_10]] : f32 to %[[VAL_7]] : @@ -94,3 +106,60 @@ func.func @nested_for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> f32 // CHECK-NEXT: %[[VAL_12:.*]] = emitc.load %[[VAL_4]] : // CHECK-NEXT: return %[[VAL_12]] : f32 // CHECK-NEXT: } + +func.func @for_yield_index(%arg0 : index, %arg1 : index, %arg2 : index) -> index { + %zero = arith.constant 0 : index + %r = scf.for %i0 = %arg0 to %arg1 step %arg2 iter_args(%acc = %zero) -> index { + scf.yield %acc : index + } + return %r : index +} + +// 
CHECK-LABEL: func.func @for_yield_index( +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> index { +// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to !emitc.size_t +// CHECK: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue +// CHECK: emitc.assign %[[VAL_3]] : !emitc.size_t to %[[VAL_4]] : +// CHECK: emitc.for %[[VAL_5:.*]] = %[[VAL_2]] to %[[VAL_1]] step %[[VAL_0]] : !emitc.size_t { +// CHECK: %[[V:.*]] = emitc.load %[[VAL_4]] : +// CHECK: emitc.assign %[[V]] : !emitc.size_t to %[[VAL_4]] : +// CHECK: } +// CHECK: %[[V2:.*]] = emitc.load %[[VAL_4]] : +// CHECK: %[[VAL_8:.*]] = builtin.unrealized_conversion_cast %[[V2]] : !emitc.size_t to index +// CHECK: return %[[VAL_8]] : index +// CHECK: } + + +func.func @for_yield_update_loop_carried_var(%arg0 : index, %arg1 : index, %arg2 : index) -> index { + %zero = arith.constant 0 : index + %r = scf.for %i0 = %arg0 to %arg1 step %arg2 iter_args(%acc = %zero) -> index { + %sn = arith.addi %acc, %acc : index + scf.yield %sn: index + } + return %r : index + } + +// CHECK-LABEL: func.func @for_yield_update_loop_carried_var( +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> index { +// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t +// CHECK: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t +// CHECK: %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = 
builtin.unrealized_conversion_cast %[[C0]] : index to !emitc.size_t +// CHECK: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue +// CHECK: emitc.assign %[[VAL_3]] : !emitc.size_t to %[[VAL_4]] : +// CHECK: emitc.for %[[ARG_3:.*]] = %[[VAL_2]] to %[[VAL_1]] step %[[VAL_0]] : !emitc.size_t { +// CHECK: %[[V:.*]] = emitc.load %[[VAL_4]] : +// CHECK: %[[VAL_5:.*]] = builtin.unrealized_conversion_cast %[[V]] : !emitc.size_t to index +// CHECK: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_5]] : index +// CHECK: %[[VAL_8:.*]] = builtin.unrealized_conversion_cast %[[VAL_6]] : index to !emitc.size_t +// CHECK: emitc.assign %[[VAL_8]] : !emitc.size_t to %[[VAL_4]] : +// CHECK: } +// CHECK: %[[V2:.*]] = emitc.load %[[VAL_4]] : +// CHECK: %[[VAL_9:.*]] = builtin.unrealized_conversion_cast %[[V2]] : !emitc.size_t to index +// CHECK: return %[[VAL_9]] : index +// CHECK: } diff --git a/mlir/test/Conversion/SCFToEmitC/switch.mlir b/mlir/test/Conversion/SCFToEmitC/switch.mlir index 86d96ed21f1b5..61015b0ae483b 100644 --- a/mlir/test/Conversion/SCFToEmitC/switch.mlir +++ b/mlir/test/Conversion/SCFToEmitC/switch.mlir @@ -1,7 +1,8 @@ // RUN: mlir-opt -allow-unregistered-dialect -convert-scf-to-emitc %s | FileCheck %s // CHECK-LABEL: func.func @switch_no_result( -// CHECK-SAME: %[[VAL_0:.*]]: index) { +// CHECK-SAME: %[[ARG_0:.*]]: index) { +// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t // CHECK: emitc.switch %[[VAL_0]] // CHECK: case 2 { // CHECK: %[[VAL_1:.*]] = arith.constant 10 : i32 @@ -33,7 +34,8 @@ func.func @switch_no_result(%arg0 : index) { } // CHECK-LABEL: func.func @switch_one_result( -// CHECK-SAME: %[[VAL_0:.*]]: index) { +// CHECK-SAME: %[[ARG_0:.*]]: index) { +// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t // CHECK: %[[VAL_1:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK: emitc.switch 
%[[VAL_0]] // CHECK: case 2 { @@ -70,7 +72,8 @@ func.func @switch_one_result(%arg0 : index) { } // CHECK-LABEL: func.func @switch_two_results( -// CHECK-SAME: %[[VAL_0:.*]]: index) -> (i32, f32) { +// CHECK-SAME: %[[ARG_0:.*]]: index) -> (i32, f32) { +// CHECK: %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t // CHECK: %[[VAL_1:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK: %[[VAL_2:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue // CHECK: emitc.switch %[[VAL_0]] From a4deb809be8f5ec3adec3626e9d700f6168d0e9f Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 2 Jan 2025 09:13:40 -0500 Subject: [PATCH 002/480] Revert "Reapply "[Driver][OHOS] Fix lld link issue for OHOS (#118192)" (#120159)" This reverts commit bd154e823eba4d62366dfa3d56ae0b99ab171b96. Test fails with -DLLVM_ENABLE_PER_TARGET_RUNTIME_DIR=OFF, see https://github.com/llvm/llvm-project/pull/120159#issuecomment-2567836727 --- clang/lib/Driver/ToolChains/OHOS.cpp | 60 ++++++++++++++++------------ 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/clang/lib/Driver/ToolChains/OHOS.cpp b/clang/lib/Driver/ToolChains/OHOS.cpp index c9a532771b99e..6e1a09ae908b2 100644 --- a/clang/lib/Driver/ToolChains/OHOS.cpp +++ b/clang/lib/Driver/ToolChains/OHOS.cpp @@ -19,8 +19,8 @@ #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" -#include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/VirtualFileSystem.h" +#include "llvm/Support/ScopedPrinter.h" using namespace clang::driver; using namespace clang::driver::toolchains; @@ -58,9 +58,11 @@ static bool findOHOSMuslMultilibs(const Driver &D, return false; } -static bool findOHOSMultilibs(const Driver &D, const ToolChain &TC, - const llvm::Triple &TargetTriple, StringRef Path, - const ArgList &Args, DetectedMultilibs &Result) { +static bool findOHOSMultilibs(const Driver &D, + const ToolChain 
&TC, + const llvm::Triple &TargetTriple, + StringRef Path, const ArgList &Args, + DetectedMultilibs &Result) { Multilib::flags_list Flags; bool IsA7 = false; if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) @@ -170,7 +172,8 @@ OHOS::OHOS(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) Paths); } -ToolChain::RuntimeLibType OHOS::GetRuntimeLibType(const ArgList &Args) const { +ToolChain::RuntimeLibType OHOS::GetRuntimeLibType( + const ArgList &Args) const { if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) { StringRef Value = A->getValue(); if (Value != "compiler-rt") @@ -181,19 +184,20 @@ ToolChain::RuntimeLibType OHOS::GetRuntimeLibType(const ArgList &Args) const { return ToolChain::RLT_CompilerRT; } -ToolChain::CXXStdlibType OHOS::GetCXXStdlibType(const ArgList &Args) const { +ToolChain::CXXStdlibType +OHOS::GetCXXStdlibType(const ArgList &Args) const { if (Arg *A = Args.getLastArg(options::OPT_stdlib_EQ)) { StringRef Value = A->getValue(); if (Value != "libc++") getDriver().Diag(diag::err_drv_invalid_stdlib_name) - << A->getAsString(Args); + << A->getAsString(Args); } return ToolChain::CST_Libcxx; } void OHOS::AddClangSystemIncludeArgs(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { + ArgStringList &CC1Args) const { const Driver &D = getDriver(); const llvm::Triple &Triple = getTriple(); std::string SysRoot = computeSysRoot(); @@ -254,7 +258,7 @@ void OHOS::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs, } void OHOS::AddCXXStdlibLibArgs(const ArgList &Args, - ArgStringList &CmdArgs) const { + ArgStringList &CmdArgs) const { switch (GetCXXStdlibType(Args)) { case ToolChain::CST_Libcxx: CmdArgs.push_back("-lc++"); @@ -287,8 +291,7 @@ ToolChain::path_list OHOS::getRuntimePaths() const { // First try the triple passed to driver as --target=. 
P.assign(D.ResourceDir); - llvm::sys::path::append(P, "lib", D.getTargetTriple(), - SelectedMultilib.gccSuffix()); + llvm::sys::path::append(P, "lib", D.getTargetTriple(), SelectedMultilib.gccSuffix()); Paths.push_back(P.c_str()); // Second try the normalized triple. @@ -337,20 +340,26 @@ std::string OHOS::getDynamicLinker(const ArgList &Args) const { std::string OHOS::getCompilerRT(const ArgList &Args, StringRef Component, FileType Type) const { - std::string CRTBasename = - buildCompilerRTBasename(Args, Component, Type, /*AddArch=*/false); - SmallString<128> Path(getDriver().ResourceDir); llvm::sys::path::append(Path, "lib", getMultiarchTriple(getTriple()), - SelectedMultilib.gccSuffix(), CRTBasename); - if (getVFS().exists(Path)) - return std::string(Path); - - std::string NewPath = ToolChain::getCompilerRT(Args, Component, Type); - if (getVFS().exists(NewPath)) - return NewPath; - - return std::string(Path); + SelectedMultilib.gccSuffix()); + const char *Prefix = + Type == ToolChain::FT_Object ? "" : "lib"; + const char *Suffix; + switch (Type) { + case ToolChain::FT_Object: + Suffix = ".o"; + break; + case ToolChain::FT_Static: + Suffix = ".a"; + break; + case ToolChain::FT_Shared: + Suffix = ".so"; + break; + } + llvm::sys::path::append( + Path, Prefix + Twine("clang_rt.") + Component + Suffix); + return static_cast(Path.str()); } void OHOS::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const { @@ -387,7 +396,7 @@ SanitizerMask OHOS::getSupportedSanitizers() const { // TODO: Make a base class for Linux and OHOS and move this there. void OHOS::addProfileRTLibs(const llvm::opt::ArgList &Args, - llvm::opt::ArgStringList &CmdArgs) const { + llvm::opt::ArgStringList &CmdArgs) const { // Add linker option -u__llvm_profile_runtime to cause runtime // initialization module to be linked in. 
if (needsProfileRT(Args)) @@ -404,8 +413,7 @@ ToolChain::path_list OHOS::getArchSpecificLibPaths() const { return Paths; } -ToolChain::UnwindLibType -OHOS::GetUnwindLibType(const llvm::opt::ArgList &Args) const { +ToolChain::UnwindLibType OHOS::GetUnwindLibType(const llvm::opt::ArgList &Args) const { if (Args.getLastArg(options::OPT_unwindlib_EQ)) return Generic_ELF::GetUnwindLibType(Args); return GetDefaultUnwindLibType(); From 6d604ba36326de849ccf00f30351ce21fde19471 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 2 Jan 2025 14:14:43 +0000 Subject: [PATCH 003/480] [gn build] Port e45e091b9089 --- .../gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn | 1 + .../clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn index 61e4f8da3c04d..670f24c242a89 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn @@ -54,6 +54,7 @@ static_library("bugprone") { "MultiLevelImplicitPointerConversionCheck.cpp", "MultipleNewInOneExpressionCheck.cpp", "MultipleStatementMacroCheck.cpp", + "NarrowingConversionsCheck.cpp", "NoEscapeCheck.cpp", "NonZeroEnumToBoolConversionCheck.cpp", "NondeterministicPointerIterationOrderCheck.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn index be444d47aa12a..a06b2f11b452a 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn @@ -28,7 +28,6 @@ static_library("cppcoreguidelines") { "MacroUsageCheck.cpp", "MisleadingCaptureDefaultByValueCheck.cpp", 
"MissingStdForwardCheck.cpp", - "NarrowingConversionsCheck.cpp", "NoMallocCheck.cpp", "NoSuspendWithLockCheck.cpp", "OwningMemoryCheck.cpp", From 073e65a8e5f92ca9c63c3fcd1c0ce2a36913f9a6 Mon Sep 17 00:00:00 2001 From: John Brawn Date: Thu, 2 Jan 2025 14:31:36 +0000 Subject: [PATCH 004/480] [LoopVectorize] Make needsExtract notice scalarized instructions (#119720) LoopVectorizationCostModel::needsExtract should recognise instructions that have been widened by scalarizing as scalar instructions, and thus not needing an extract when used by later scalarized instructions. This fixes an incorrect cost calculation in computePredInstDiscount, where we are adding a scalarization overhead cost when we shouldn't, though I haven't come up with a test case where it makes a difference. It will make a difference when the cost model switches to using the cost kind TCK_CodeSize for optsize, as not doing this causes the test LoopVectorize/X86/small-size.ll to get worse. --- .../Transforms/Vectorize/LoopVectorize.cpp | 3 +- .../LoopVectorize/AArch64/interleaved_cost.ll | 4 +- .../LoopVectorize/ARM/mve-interleaved-cost.ll | 320 +++++++++--------- 3 files changed, 164 insertions(+), 163 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 52186882b4f20..f2f8a85b7cc23 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1731,7 +1731,8 @@ class LoopVectorizationCostModel { bool needsExtract(Value *V, ElementCount VF) const { Instruction *I = dyn_cast(V); if (VF.isScalar() || !I || !TheLoop->contains(I) || - TheLoop->isLoopInvariant(I)) + TheLoop->isLoopInvariant(I) || + getWideningDecision(I, VF) == CM_Scalarize) return false; // Assume we can vectorize V (and hence we need extraction) if the diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll index 
dec124b55cd4e..a550f1ca14c8b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll @@ -170,8 +170,8 @@ entry: ; VF_2-LABEL: Checking a loop in 'i64_factor_8' ; VF_2: Found an estimated cost of 8 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.8, ptr %data, i64 %i, i32 2 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll index 976c6a9a570af..551b85b7d0357 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll @@ -17,8 +17,8 @@ entry: ; VF_2-LABEL: Checking a loop in 'i8_factor_2' ; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp3, 
ptr %tmp1, align 1 ; VF_4-LABEL: Checking a loop in 'i8_factor_2' ; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 @@ -58,8 +58,8 @@ entry: ; VF_2-LABEL: Checking a loop in 'i16_factor_2' ; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2 ; VF_4-LABEL: Checking a loop in 'i16_factor_2' ; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 @@ -99,8 +99,8 @@ entry: ; VF_2-LABEL: Checking a loop in 'i32_factor_2' ; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4 ; VF_4-LABEL: Checking a loop in 'i32_factor_2' ; VF_4: Found an 
estimated cost of 4 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 @@ -140,23 +140,23 @@ entry: ; VF_2-LABEL: Checking a loop in 'i64_factor_2' ; VF_2: Found an estimated cost of 22 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_4-LABEL: Checking a loop in 'i64_factor_2' ; VF_4: Found an estimated cost of 44 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_8-LABEL: Checking a loop in 'i64_factor_2' ; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For 
instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_16-LABEL: Checking a loop in 'i64_factor_2' ; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.2, ptr %data, i64 %i, i32 0 @@ -181,8 +181,8 @@ entry: ; VF_2-LABEL: Checking a loop in 'f16_factor_2' ; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load half, ptr %tmp0, align 2 ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2 ; VF_4-LABEL: Checking a loop in 'f16_factor_2' ; VF_4: Found an estimated cost of 18 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For 
instruction: %tmp3 = load half, ptr %tmp1, align 2 @@ -263,23 +263,23 @@ entry: ; VF_2-LABEL: Checking a loop in 'f64_factor_2' ; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load double, ptr %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load double, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8 ; VF_4-LABEL: Checking a loop in 'f64_factor_2' ; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp2 = load double, ptr %tmp0, align 8 ; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8 ; VF_8-LABEL: Checking a loop in 'f64_factor_2' ; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %tmp2 = load double, ptr %tmp0, align 8 ; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 
For instruction: store double %tmp2, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8 ; VF_16-LABEL: Checking a loop in 'f64_factor_2' ; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp2 = load double, ptr %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f64.2, ptr %data, i64 %i, i32 0 @@ -309,30 +309,30 @@ entry: ; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1 ; VF_4-LABEL: Checking a loop in 
'i8_factor_3' ; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp3, ptr %tmp0, align 1 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp3, ptr %tmp0, align 1 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1 ; VF_8-LABEL: Checking a loop in 'i8_factor_3' ; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1 ; VF_16-LABEL: Checking a loop in 'i8_factor_3' ; VF_16: 
Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.3, ptr %data, i64 %i, i32 0 @@ -361,30 +361,30 @@ entry: ; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For 
instruction: store i16 %tmp5, ptr %tmp2, align 2 ; VF_4-LABEL: Checking a loop in 'i16_factor_3' ; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2 ; VF_8-LABEL: Checking a loop in 'i16_factor_3' ; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For 
instruction: store i16 %tmp5, ptr %tmp2, align 2 ; VF_16-LABEL: Checking a loop in 'i16_factor_3' ; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.3, ptr %data, i64 %i, i32 0 @@ -413,9 +413,9 @@ entry: ; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp0, align 4 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp1, align 4 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp2, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for 
VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4 ; VF_4-LABEL: Checking a loop in 'i32_factor_3' ; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp1, align 4 @@ -427,16 +427,16 @@ entry: ; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp2, align 4 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4 ; VF_16-LABEL: Checking a loop in 'i32_factor_3' ; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp2, align 4 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4 -; VF_16-NEXT: Found 
an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.3, ptr %data, i64 %i, i32 0 @@ -465,30 +465,30 @@ entry: ; VF_2: Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp1, align 8 ; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp2, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8 ; VF_4-LABEL: Checking a loop in 'i64_factor_3' ; VF_4: Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp0, align 8 ; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp1, align 8 ; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp2, align 8 -; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8 -; VF_4-NEXT: Found 
an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8 ; VF_8-LABEL: Checking a loop in 'i64_factor_3' ; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp0, align 8 ; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp1, align 8 ; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp2, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8 ; VF_16-LABEL: Checking a loop in 'i64_factor_3' ; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp2, align 8 -; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8 -; 
VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.3, ptr %data, i64 %i, i32 0 @@ -517,9 +517,9 @@ entry: ; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp0, align 2 ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load half, ptr %tmp1, align 2 ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load half, ptr %tmp2, align 2 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2 ; VF_4-LABEL: Checking a loop in 'f16_factor_3' ; VF_4: Found an estimated cost of 28 for VF 4 For instruction: %tmp3 = load half, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load half, ptr %tmp1, align 2 @@ -621,30 +621,30 @@ entry: ; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = 
load double, ptr %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load double, ptr %tmp1, align 8 ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp2, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8 ; VF_4-LABEL: Checking a loop in 'f64_factor_3' ; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp0, align 8 ; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp1, align 8 ; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp2, align 8 -; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8 ; VF_8-LABEL: Checking a loop in 'f64_factor_3' ; VF_8: Found an estimated cost of 
24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp0, align 8 ; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp1, align 8 ; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp2, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8 ; VF_16-LABEL: Checking a loop in 'f64_factor_3' ; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp2, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8 for.body: %i = 
phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f64.3, ptr %data, i64 %i, i32 0 @@ -677,37 +677,37 @@ entry: ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp1, align 1 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i8, ptr %tmp2, align 1 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i8, ptr %tmp3, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1 ; VF_4-LABEL: Checking a loop in 'i8_factor_4' ; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp0, align 1 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp1, align 1 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i8, ptr %tmp2, align 1 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i8, ptr %tmp3, align 1 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1 -; VF_4-NEXT: Found an estimated cost 
of 24 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1 ; VF_8-LABEL: Checking a loop in 'i8_factor_4' ; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp0, align 1 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp1, align 1 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i8, ptr %tmp2, align 1 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i8, ptr %tmp3, align 1 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1 ; VF_16-LABEL: Checking a loop in 'i8_factor_4' ; VF_16: Found an estimated cost of 96 for VF 16 For 
instruction: %tmp4 = load i8, ptr %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i8, ptr %tmp2, align 1 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i8, ptr %tmp3, align 1 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.4, ptr %data, i64 %i, i32 0 @@ -740,37 +740,37 @@ entry: ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp1, align 2 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i16, ptr %tmp2, align 2 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i16, ptr %tmp3, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp6, ptr 
%tmp2, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2 ; VF_4-LABEL: Checking a loop in 'i16_factor_4' ; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp1, align 2 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i16, ptr %tmp2, align 2 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i16, ptr %tmp3, align 2 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2 ; VF_8-LABEL: Checking a loop in 'i16_factor_4' ; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp0, align 
2 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i16, ptr %tmp2, align 2 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i16, ptr %tmp3, align 2 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2 ; VF_16-LABEL: Checking a loop in 'i16_factor_4' ; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i16, ptr %tmp2, align 2 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i16, ptr %tmp3, align 2 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2 -; VF_16-NEXT: Found an 
estimated cost of 96 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.4, ptr %data, i64 %i, i32 0 @@ -803,10 +803,10 @@ entry: ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp1, align 4 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i32, ptr %tmp2, align 4 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i32, ptr %tmp3, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4 ; VF_4-LABEL: Checking a loop in 'i32_factor_4' ; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp0, align 4 ; VF_4-NEXT: Found 
an estimated cost of 8 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp1, align 4 @@ -821,19 +821,19 @@ entry: ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i32, ptr %tmp2, align 4 ; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i32, ptr %tmp3, align 4 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4 ; VF_16-LABEL: Checking a loop in 'i32_factor_4' ; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i32, ptr %tmp2, align 4 ; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i32, ptr %tmp3, align 4 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4 -; VF_16-NEXT: Found an 
estimated cost of 96 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.4, ptr %data, i64 %i, i32 0 @@ -866,37 +866,37 @@ entry: ; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp1, align 8 ; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp6 = load i64, ptr %tmp2, align 8 ; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp7 = load i64, ptr %tmp3, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8 -; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8 ; VF_4-LABEL: Checking a loop in 'i64_factor_4' ; VF_4: 
Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp0, align 8 ; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp1, align 8 ; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp6 = load i64, ptr %tmp2, align 8 ; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp7 = load i64, ptr %tmp3, align 8 -; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8 -; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8 ; VF_8-LABEL: Checking a loop in 'i64_factor_4' ; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp0, align 8 ; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp1, align 8 ; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp6 = load i64, ptr %tmp2, align 8 ; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp7 = load i64, ptr %tmp3, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For 
instruction: store i64 %tmp6, ptr %tmp2, align 8 -; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8 ; VF_16-LABEL: Checking a loop in 'i64_factor_4' ; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp6 = load i64, ptr %tmp2, align 8 ; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp7 = load i64, ptr %tmp3, align 8 -; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8 -; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = 
getelementptr inbounds %i64.4, ptr %data, i64 %i, i32 0 @@ -1055,37 +1055,37 @@ entry: ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp1, align 8 ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp6 = load double, ptr %tmp2, align 8 ; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp7 = load double, ptr %tmp3, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8 -; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8 +; VF_2-NEXT: Found an estimated cost of 4 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8 ; VF_4-LABEL: Checking a loop in 'f64_factor_4' ; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp0, align 8 ; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp1, align 8 ; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp6 = load double, ptr %tmp2, align 8 ; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp7 = load double, ptr %tmp3, align 8 -; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8 -; VF_4-NEXT: Found an 
estimated cost of 12 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8 -; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8 ; VF_8-LABEL: Checking a loop in 'f64_factor_4' ; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp0, align 8 ; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp1, align 8 ; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp6 = load double, ptr %tmp2, align 8 ; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp7 = load double, ptr %tmp3, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8 -; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8 +; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8 ; VF_16-LABEL: 
Checking a loop in 'f64_factor_4' ; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp6 = load double, ptr %tmp2, align 8 ; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp7 = load double, ptr %tmp3, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8 -; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8 +; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f64.4, ptr %data, i64 %i, i32 0 From a9a3fb5b1a23e336a1656046ba1a36832e020d4e Mon Sep 17 00:00:00 2001 From: Bart Chrzaszcz Date: Thu, 2 Jan 2025 14:38:43 +0000 Subject: [PATCH 005/480] Update BUILD.bazel due to PR #121476 Breaks bazel builds due to missing dependency --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index f1192d069fa5f..e823af2f14712 100644 --- 
a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -8391,6 +8391,7 @@ cc_library( ":ArithDialect", ":ConversionPassIncGen", ":EmitCDialect", + ":EmitCTransforms", ":IR", ":SCFDialect", ":TransformUtils", From bb27d5e5c6b194a1440b8ac4e5ace68d0ee2a849 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Don=C3=A1t=20Nagy?= Date: Thu, 2 Jan 2025 15:51:03 +0100 Subject: [PATCH 006/480] [analyzer] Don't assume third iteration in loops (#119388) This commit ensures that if the loop condition is opaque (the analyzer cannot determine whether it's true or false) and there were at least two iterations, then the analyzer doesn't make the unjustified assumption that it can enter yet another iteration. Note that the presence of a loop suggests that the developer thought that two iterations can happen (otherwise an `if` would've been sufficient), but it does not imply that the developer expected three or four iterations -- and in fact there are many false positives where a loop iterates over a two-element (or three-element) data structure, but the analyzer cannot understand the loop condition and blindly assumes that there may be three or more iterations. (In particular, analyzing the FFMPEG project produces 100+ such false positives.) Moreover, this provides some performance improvements in the sense that the analyzer won't waste time on traversing the execution paths with 3 or 4 iterations in a loop (which are very similar to the paths with 2 iterations) and therefore will be able to traverse more branches elsewhere on the `ExplodedGraph`. This logic is disabled if the user enables the widen-loops analyzer option (which is disabled by default), because the "simulate one final iteration after the invalidation" execution path would be suppressed by the "exit the loop if the loop condition is opaque and there were at least two iterations" logic. 
If we want to support loop widening, we would need to create a follow-up commit which ensures that it "plays nicely" with this logic. --- clang/docs/ReleaseNotes.rst | 7 + .../Core/PathSensitive/CoreEngine.h | 8 + .../Core/PathSensitive/ExprEngine.h | 18 +- clang/lib/StaticAnalyzer/Core/CoreEngine.cpp | 27 ++- clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 58 ++++- clang/test/Analysis/loop-assumptions.c | 219 ++++++++++++++++++ clang/test/Analysis/loop-unrolling.cpp | 35 +-- clang/test/Analysis/misc-ps-region-store.m | 31 ++- 8 files changed, 362 insertions(+), 41 deletions(-) create mode 100644 clang/test/Analysis/loop-assumptions.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e0aef1af2135c..aca07e2ba9cf2 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1157,6 +1157,13 @@ New features Crash and bug fixes ^^^^^^^^^^^^^^^^^^^ +- In loops where the loop condition is opaque (i.e. the analyzer cannot + determine whether it's true or false), the analyzer will no longer assume + execution paths that perform more than two iterations. These unjustified + assumptions caused false positive reports (e.g. 100+ out-of-bounds reports in + the FFMPEG codebase) in loops where the programmer intended only two or three + steps but the analyzer wasn't able to understand that the loop is limited. + Improvements ^^^^^^^^^^^^ diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h index a6d05a3ac67b4..80b79fd4e928f 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h @@ -126,6 +126,14 @@ class CoreEngine { ExplodedNode *generateCallExitBeginNode(ExplodedNode *N, const ReturnStmt *RS); + /// Helper function called by `HandleBranch()`. 
If the currently handled + /// branch corresponds to a loop, this returns the number of already + /// completed iterations in that loop, otherwise the return value is + /// `std::nullopt`. Note that this counts _all_ earlier iterations, including + /// ones that were performed within an earlier iteration of an outer loop. + std::optional getCompletedIterationCount(const CFGBlock *B, + ExplodedNode *Pred) const; + public: /// Construct a CoreEngine object to analyze the provided CFG. CoreEngine(ExprEngine &exprengine, diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h index 8c7493e27fcaa..20c446e33ef9a 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h @@ -321,14 +321,14 @@ class ExprEngine { NodeBuilderWithSinks &nodeBuilder, ExplodedNode *Pred); - /// ProcessBranch - Called by CoreEngine. Used to generate successor - /// nodes by processing the 'effects' of a branch condition. - void processBranch(const Stmt *Condition, - NodeBuilderContext& BuilderCtx, - ExplodedNode *Pred, - ExplodedNodeSet &Dst, - const CFGBlock *DstT, - const CFGBlock *DstF); + /// ProcessBranch - Called by CoreEngine. Used to generate successor nodes by + /// processing the 'effects' of a branch condition. If the branch condition + /// is a loop condition, IterationsCompletedInLoop is the number of completed + /// iterations (otherwise it's std::nullopt). + void processBranch(const Stmt *Condition, NodeBuilderContext &BuilderCtx, + ExplodedNode *Pred, ExplodedNodeSet &Dst, + const CFGBlock *DstT, const CFGBlock *DstF, + std::optional IterationsCompletedInLoop); /// Called by CoreEngine. 
/// Used to generate successor nodes for temporary destructors depending @@ -588,6 +588,8 @@ class ExprEngine { void evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src, const Expr *Ex); + bool didEagerlyAssumeBifurcateAt(ProgramStateRef State, const Expr *Ex) const; + static std::pair getEagerlyAssumeBifurcationTags(); diff --git a/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp b/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp index 67b7d30853d9d..775a22e18c619 100644 --- a/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp @@ -444,7 +444,8 @@ void CoreEngine::HandleBranch(const Stmt *Cond, const Stmt *Term, NodeBuilderContext Ctx(*this, B, Pred); ExplodedNodeSet Dst; ExprEng.processBranch(Cond, Ctx, Pred, Dst, *(B->succ_begin()), - *(B->succ_begin() + 1)); + *(B->succ_begin() + 1), + getCompletedIterationCount(B, Pred)); // Enqueue the new frontier onto the worklist. enqueue(Dst); } @@ -591,6 +592,30 @@ ExplodedNode *CoreEngine::generateCallExitBeginNode(ExplodedNode *N, return isNew ? Node : nullptr; } +std::optional +CoreEngine::getCompletedIterationCount(const CFGBlock *B, + ExplodedNode *Pred) const { + const LocationContext *LC = Pred->getLocationContext(); + BlockCounter Counter = WList->getBlockCounter(); + unsigned BlockCount = + Counter.getNumVisited(LC->getStackFrame(), B->getBlockID()); + + const Stmt *Term = B->getTerminatorStmt(); + if (isa(Term)) { + assert(BlockCount >= 1 && + "Block count of currently analyzed block must be >= 1"); + return BlockCount - 1; + } + if (isa(Term)) { + // In a do-while loop one iteration happens before the first evaluation of + // the loop condition, so we don't subtract one. + return BlockCount; + } + // ObjCForCollectionStmt is skipped intentionally because the current + // application of the iteration counts is not relevant for it. 
+ return std::nullopt; +} + void CoreEngine::enqueue(ExplodedNodeSet &Set) { for (const auto I : Set) WList->enqueue(I); diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index db385e891e762..362a985b9174a 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -2760,12 +2760,10 @@ assumeCondition(const Stmt *Condition, ExplodedNode *N) { return State->assume(V); } -void ExprEngine::processBranch(const Stmt *Condition, - NodeBuilderContext& BldCtx, - ExplodedNode *Pred, - ExplodedNodeSet &Dst, - const CFGBlock *DstT, - const CFGBlock *DstF) { +void ExprEngine::processBranch( + const Stmt *Condition, NodeBuilderContext &BldCtx, ExplodedNode *Pred, + ExplodedNodeSet &Dst, const CFGBlock *DstT, const CFGBlock *DstF, + std::optional IterationsCompletedInLoop) { assert((!Condition || !isa(Condition)) && "CXXBindTemporaryExprs are handled by processBindTemporary."); const LocationContext *LCtx = Pred->getLocationContext(); @@ -2808,8 +2806,35 @@ void ExprEngine::processBranch(const Stmt *Condition, if (StTrue && StFalse) assert(!isa(Condition)); - if (StTrue) - Builder.generateNode(StTrue, true, PredN); + if (StTrue) { + // If we are processing a loop condition where two iterations have + // already been completed and the false branch is also feasible, then + // don't assume a third iteration because it is a redundant execution + // path (unlikely to be different from earlier loop exits) and can cause + // false positives if e.g. the loop iterates over a two-element structure + // with an opaque condition. 
+ // + // The iteration count "2" is hardcoded because it's the natural limit: + // * the fact that the programmer wrote a loop (and not just an `if`) + // implies that they thought that the loop body might be executed twice; + // * however, there are situations where the programmer knows that there + // are at most two iterations but writes a loop that appears to be + // generic, because there is no special syntax for "loop with at most + // two iterations". (This pattern is common in FFMPEG and appears in + // many other projects as well.) + bool CompletedTwoIterations = IterationsCompletedInLoop.value_or(0) >= 2; + bool FalseAlsoFeasible = + StFalse || + didEagerlyAssumeBifurcateAt(PrevState, dyn_cast(Condition)); + bool SkipTrueBranch = CompletedTwoIterations && FalseAlsoFeasible; + + // FIXME: This "don't assume third iteration" heuristic partially + // conflicts with the widen-loop analysis option (which is off by + // default). If we intend to support and stabilize the loop widening, + // we must ensure that it 'plays nicely' with this logic. + if (!SkipTrueBranch || AMgr.options.ShouldWidenLoops) + Builder.generateNode(StTrue, true, PredN); + } if (StFalse) Builder.generateNode(StFalse, false, PredN); @@ -3731,6 +3756,12 @@ ExprEngine::getEagerlyAssumeBifurcationTags() { return std::make_pair(&TrueTag, &FalseTag); } +/// If the last EagerlyAssume attempt was successful (i.e. the true and false +/// cases were both feasible), this state trait stores the expression where it +/// happened; otherwise this holds nullptr. 
+REGISTER_TRAIT_WITH_PROGRAMSTATE(LastEagerlyAssumeExprIfSuccessful, + const Expr *) + void ExprEngine::evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src, const Expr *Ex) { @@ -3746,6 +3777,7 @@ void ExprEngine::evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, } ProgramStateRef State = Pred->getState(); + State = State->set(nullptr); SVal V = State->getSVal(Ex, Pred->getLocationContext()); std::optional SEV = V.getAs(); if (SEV && SEV->isExpression()) { @@ -3753,6 +3785,11 @@ void ExprEngine::evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, auto [StateTrue, StateFalse] = State->assume(*SEV); + if (StateTrue && StateFalse) { + StateTrue = StateTrue->set(Ex); + StateFalse = StateFalse->set(Ex); + } + // First assume that the condition is true. if (StateTrue) { SVal Val = svalBuilder.makeIntVal(1U, Ex->getType()); @@ -3770,6 +3807,11 @@ void ExprEngine::evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, } } +bool ExprEngine::didEagerlyAssumeBifurcateAt(ProgramStateRef State, + const Expr *Ex) const { + return Ex && State->get() == Ex; +} + void ExprEngine::VisitGCCAsmStmt(const GCCAsmStmt *A, ExplodedNode *Pred, ExplodedNodeSet &Dst) { StmtNodeBuilder Bldr(Pred, Dst, *currBldrCtx); diff --git a/clang/test/Analysis/loop-assumptions.c b/clang/test/Analysis/loop-assumptions.c new file mode 100644 index 0000000000000..eb0ffdce722e0 --- /dev/null +++ b/clang/test/Analysis/loop-assumptions.c @@ -0,0 +1,219 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=debug.ExprInspection \ +// RUN: -verify=expected,eagerlyassume %s +// RUN: %clang_analyze_cc1 -analyzer-checker=debug.ExprInspection \ +// RUN: -analyzer-config eagerly-assume=false \ +// RUN: -verify=expected,noeagerlyassume %s + +// These tests validate the logic within `ExprEngine::processBranch` which +// ensures that in loops with opaque conditions we don't assume execution paths +// if the code does not imply that they are possible. 
+ +void clang_analyzer_numTimesReached(void); +void clang_analyzer_warnIfReached(void); +void clang_analyzer_dump(int); + +void clearCondition(void) { + // If the analyzer can definitely determine the value of the loop condition, + // then this corrective logic doesn't activate and the engine executes + // `-analyzer-max-loop` iterations (by default, 4). + for (int i = 0; i < 10; i++) + clang_analyzer_numTimesReached(); // expected-warning {{4}} + + clang_analyzer_warnIfReached(); // unreachable +} + +void opaqueCondition(int arg) { + // If the loop condition is opaque, don't assume more than two iterations, + // because the presence of a loop does not imply that the programmer thought + // that more than two iterations are possible. (It _does_ imply that two + // iterations may be possible at least in some cases, because otherwise an + // `if` would've been enough.) + for (int i = 0; i < arg; i++) + clang_analyzer_numTimesReached(); // expected-warning {{2}} + + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +int check(void); + +void opaqueConditionCall(int arg) { + // Same situation as `opaqueCondition()` but with a `while ()` loop. This + // is also an example for a situation where the programmer cannot easily + // insert an assertion to guide the analyzer and rule out more than two + // iterations (so the analyzer needs to proactively avoid those unjustified + // branches). + while (check()) + clang_analyzer_numTimesReached(); // expected-warning {{2}} + + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +void opaqueConditionDoWhile(int arg) { + // Same situation as `opaqueCondition()` but with a `do {} while ()` loop. + // This is tested separately because this loop type is a special case in the + // iteration count calculation. 
+ int i = 0; + do { + clang_analyzer_numTimesReached(); // expected-warning {{2}} + } while (i++ < arg); + + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +void dontRememberOldBifurcation(int arg) { + // In this (slightly contrived) test case the analyzer performs an assumption + // at the first iteration of the loop, but does not make any new assumptions + // in the subsequent iterations, so the analyzer should continue evaluating + // the loop. + // Previously this was mishandled in `eagerly-assume` mode (which is enabled + // by default), because the code remembered that there was a bifurcation on + // the first iteration of the loop and didn't realize that this is obsolete. + + // NOTE: The variable `i` is introduced to ensure that the iterations of the + // loop change the state -- otherwise the analyzer stops iterating because it + // returns to the same `ExplodedNode`. + int i = 0; + while (arg > 3) { + clang_analyzer_numTimesReached(); // expected-warning {{4}} + i++; + } + + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +void dontAssumeFourthIterartion(int arg) { + if (arg == 2) + return; + + // In this function the analyzer cannot leave the loop after exactly two + // iterations (because it knows that `arg != 2` at that point), so it + // performs a third iteration, but it does not assume that a fourth iteration + // is also possible. + for (int i = 0; i < arg; i++) + clang_analyzer_numTimesReached(); // expected-warning {{3}} + + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +#define TRUE 1 +void shortCircuitInLoopCondition(int arg) { + // When the loop condition expression contains short-circuiting operators, it + // performs "inner" bifurcations for those operators and only considers the + // last (rightmost) operand as the branch condition that is associated with + // the loop itself (as its loop condition). 
+ // This means that assumptions taken in the left-hand side of a short-circuiting + // operator are not recognized as "opaque" loop condition, so the loop in + // this test case is allowed to finish four iterations. + // FIXME: This corner case is responsible for at least one out-of-bounds + // false positive on the ffmpeg codebase. Eventually we should properly + // recognize the full syntactical loop condition expression as "the loop + // condition", but this will be complicated to implement. + for (int i = 0; i < arg && TRUE; i++) { + clang_analyzer_numTimesReached(); // expected-warning {{4}} + } + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +void shortCircuitInLoopConditionRHS(int arg) { + // Unlike `shortCircuitInLoopCondition()`, this case is handled properly + // because the analyzer thinks that the right hand side of the `&&` is the + // loop condition. + for (int i = 0; TRUE && i < arg; i++) { + clang_analyzer_numTimesReached(); // expected-warning {{2}} + } + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +void eagerlyAssumeInSubexpression(int arg) { + // The `EagerlyAssume` logic is another complication that can "split the + // state" within the loop condition, but before the `processBranch()` call + // which is (in theory) responsible for evaluating the loop condition. + // The current implementation partially compensates this by noticing the + // cases where the loop condition is targeted by `EagerlyAssume`, but does + // not handle the (fortunately rare) case when `EagerlyAssume` hits a + // sub-expression of the loop condition (as in this contrived test case). + // FIXME: I don't know a real-world example for this inconsistency, but it + // would be good to eliminate it eventually. 
+ for (int i = 0; (i >= arg) - 1; i++) { + clang_analyzer_numTimesReached(); // eagerlyassume-warning {{4}} noeagerlyassume-warning {{2}} + } + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} + +void calledTwice(int arg, int isFirstCall) { + // This function is called twice (with two different unknown 'arg' values) to + // check the iteration count handling in this situation. + for (int i = 0; i < arg; i++) { + if (isFirstCall) { + clang_analyzer_numTimesReached(); // expected-warning {{2}} + } else { + clang_analyzer_numTimesReached(); // expected-warning {{2}} + } + } +} + +void caller(int arg, int arg2) { + // Entry point for `calledTwice()`. + calledTwice(arg, 1); + calledTwice(arg2, 0); +} + +void innerLoopClearCondition(void) { + // A "control group" test case for the behavior of an inner loop. Notice that + // although the (default) value of `-analyzer-max-loop` is 4, we only see 3 iterations + // of the inner loop, because `-analyzer-max-loop` limits the number of + // evaluations of _the loop condition of the inner loop_ and in addition to + // the 3 evaluations before the 3 iterations, there is also a step where it + // evaluates to false (in the first iteration of the outer loop). 
+ for (int outer = 0; outer < 2; outer++) { + int limit = 0; + if (outer) + limit = 10; + clang_analyzer_dump(limit); // expected-warning {{0}} expected-warning {{10}} + for (int i = 0; i < limit; i++) { + clang_analyzer_numTimesReached(); // expected-warning {{3}} + } + } +} + +void innerLoopOpaqueCondition(int arg) { + // In this test case the engine doesn't assume a second iteration within the + // inner loop (in the second iteration of the outer loop, when the limit is + // opaque) because `CoreEngine::getCompletedIterationCount()` is based on the + // `BlockCount` values queried from the `BlockCounter` which count _all_ + // evaluations of a given `CFGBlock` (in our case, the loop condition) and + // not just the evaluations within the current iteration of the outer loop. + // FIXME: This inaccurate iteration count could in theory cause some false + // negatives, although I think this would be unusual in practice, as the + // small default value of `-analyzer-max-loop` means that this is only + // relevant if the analyzer can deduce that the inner loop performs 0 or 1 + // iterations within the first iteration of the outer loop (and then the + // condition of the inner loop is opaque within the second iteration of the + // outer loop). + for (int outer = 0; outer < 2; outer++) { + int limit = 0; + if (outer) + limit = arg; + clang_analyzer_dump(limit); // expected-warning {{0}} expected-warning {{reg_$}} + for (int i = 0; i < limit; i++) { + clang_analyzer_numTimesReached(); // expected-warning {{1}} + } + } +} + +void onlyLoopConditions(int arg) { + // This "don't assume third iteration" logic only examines the conditions of + // loop statements and does not affect the analysis of code that implements + // similar behavior with different language features like if + break, goto, + // recursive functions, ... + int i = 0; + while (1) { + clang_analyzer_numTimesReached(); // expected-warning {{4}} + + // This is not a loop condition. 
+ if (i++ > arg) + break; + } + + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} +} diff --git a/clang/test/Analysis/loop-unrolling.cpp b/clang/test/Analysis/loop-unrolling.cpp index 66a828abfb513..bf05a7739ce48 100644 --- a/clang/test/Analysis/loop-unrolling.cpp +++ b/clang/test/Analysis/loop-unrolling.cpp @@ -63,7 +63,7 @@ int simple_no_unroll1() { int a[9]; int k = 42; for (int i = 0; i < 9; i++) { - clang_analyzer_numTimesReached(); // expected-warning {{4}} + clang_analyzer_numTimesReached(); // expected-warning {{2}} a[i] = 42; foo(i); } @@ -76,7 +76,7 @@ int simple_no_unroll2() { int k = 42; int i; for (i = 0; i < 9; i++) { - clang_analyzer_numTimesReached(); // expected-warning {{4}} + clang_analyzer_numTimesReached(); // expected-warning {{2}} a[i] = 42; i += getNum(); } @@ -309,9 +309,9 @@ int nested_inner_unrolled() { int k = 42; int j = 0; for (int i = 0; i < getNum(); i++) { - clang_analyzer_numTimesReached(); // expected-warning {{4}} + clang_analyzer_numTimesReached(); // expected-warning {{2}} for (j = 0; j < 8; ++j) { - clang_analyzer_numTimesReached(); // expected-warning {{32}} + clang_analyzer_numTimesReached(); // expected-warning {{16}} a[j] = 22; } a[i] = 42; @@ -346,11 +346,7 @@ int simple_known_bound_loop() { int simple_unknown_bound_loop() { for (int i = 2; i < getNum(); i++) { -#ifdef DFS - clang_analyzer_numTimesReached(); // expected-warning {{16}} -#else clang_analyzer_numTimesReached(); // expected-warning {{8}} -#endif } return 0; } @@ -368,11 +364,7 @@ int nested_inlined_unroll1() { int nested_inlined_no_unroll1() { int k; for (int i = 0; i < 9; i++) { -#ifdef DFS - clang_analyzer_numTimesReached(); // expected-warning {{18}} -#else - clang_analyzer_numTimesReached(); // expected-warning {{14}} -#endif + clang_analyzer_numTimesReached(); // expected-warning {{10}} k = simple_unknown_bound_loop(); // reevaluation without inlining, splits the state as well } int a = 22 / k; // no-warning @@ -475,9 +467,13 @@ int 
num_steps_over_limit2() { int num_steps_on_limit3() { for (int i = 0; i < getNum(); i++) { - clang_analyzer_numTimesReached(); // expected-warning {{4}} + clang_analyzer_numTimesReached(); // expected-warning {{2}} for (int j = 0; j < 32; j++) { - clang_analyzer_numTimesReached(); // expected-warning {{128}} + // Here the loop unrolling logic calculates with four potential iterations + // in the outer loop where it cannot determine the iteration count in + // advance; but after two loops the analyzer conservatively assumes that + // the (still opaque) loop condition is false. + clang_analyzer_numTimesReached(); // expected-warning {{64}} } } return 0; @@ -493,6 +489,15 @@ int num_steps_over_limit3() { return 0; } +int num_steps_on_limit4() { + for (int i = 0; i < 4; i++) { + clang_analyzer_numTimesReached(); // expected-warning {{4}} + for (int j = 0; j < 32; j++) { + clang_analyzer_numTimesReached(); // expected-warning {{128}} + } + } + return 0; +} void pr34943() { for (int i = 0; i < 6L; ++i) { diff --git a/clang/test/Analysis/misc-ps-region-store.m b/clang/test/Analysis/misc-ps-region-store.m index 668b5ffd7001a..a882e7eb0dc90 100644 --- a/clang/test/Analysis/misc-ps-region-store.m +++ b/clang/test/Analysis/misc-ps-region-store.m @@ -910,13 +910,13 @@ void pr6302(id x, Class y) { //===----------------------------------------------------------------------===// // Specially handle global variables that are declared constant. In the -// example below, this forces the loop to take exactly 2 iterations. +// example below, this forces the loop to take exactly 1 iteration. 
//===----------------------------------------------------------------------===// -const int pr6288_L_N = 2; +const int pr6288_L_N = 1; void pr6288_(void) { - int x[2]; - int *px[2]; + int x[1]; + int *px[1]; int i; for (i = 0; i < pr6288_L_N; i++) px[i] = &x[i]; @@ -924,8 +924,8 @@ void pr6288_(void) { } void pr6288_pos(int z) { - int x[2]; - int *px[2]; + int x[1]; + int *px[1]; int i; for (i = 0; i < z; i++) px[i] = &x[i]; // expected-warning{{Access out-of-bound array element (buffer overflow)}} @@ -933,15 +933,28 @@ void pr6288_pos(int z) { } void pr6288_b(void) { - const int L_N = 2; - int x[2]; - int *px[2]; + const int L_N = 1; + int x[1]; + int *px[1]; int i; for (i = 0; i < L_N; i++) px[i] = &x[i]; *(px[0]) = 0; // no-warning } +void pr6288_no_third_iter(int z) { + int x[2]; + int *px[2]; + int i; + // If the loop condition is opaque, we assume that there may be two + // iterations (because otherwise the loop could be replaced by an if); but + // we do not assume that there may be a third iteration. Therefore, + // unlike 'pr6288_pos', this testcase does not produce an out-of-bounds error. + for (i = 0; i < z; i++) + px[i] = &x[i]; + *(px[0]) = 0; // expected-warning{{Dereference of undefined pointer value}} +} + // A bug in RemoveDeadBindings was causing instance variable bindings to get // prematurely pruned from the state. 
@interface Rdar7817800 { From 4a890c2c605640f48ecbaefebda8f3a42043ff3d Mon Sep 17 00:00:00 2001 From: klensy Date: Thu, 2 Jan 2025 18:18:20 +0300 Subject: [PATCH 007/480] [llvm][aarch64] fix copypaste typo (#120725) moved from #119881 --- .../lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +- llvm/test/CodeGen/AArch64/vecreduce-add.ll | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 24e1ebd8421fb..070163a5fb297 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18430,7 +18430,7 @@ static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) { EVT VT = A.getValueType(); SDValue Op0 = A.getOperand(0); SDValue Op1 = A.getOperand(1); - if (Op0.getOpcode() != Op0.getOpcode() || + if (Op0.getOpcode() != Op1.getOpcode() || (Op0.getOpcode() != ISD::ZERO_EXTEND && Op0.getOpcode() != ISD::SIGN_EXTEND)) return SDValue(); diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 8473f45f6c803..5d6b523f1549a 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -72,6 +72,24 @@ entry: ret i64 %z } +define i64 @add_v4i32_v4i64_zsext(<4 x i32> %xi) { +; CHECK-LABEL: add_v4i32_v4i64_zsext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v1.2d, v0.2s, #0 +; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %x = shufflevector <4 x i32> %xi, <4 x i32> %xi, <2 x i32> + %y = shufflevector <4 x i32> %xi, <4 x i32> %xi, <2 x i32> + %xx = zext <2 x i32> %x to <2 x i64> + %yy = sext <2 x i32> %y to <2 x i64> + %zz = add <2 x i64> %xx, %yy + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %zz) + ret i64 %z +} + define i64 @add_v2i32_v2i64_zext(<2 x i32> %x) { ; CHECK-LABEL: add_v2i32_v2i64_zext: ; 
CHECK: // %bb.0: // %entry From 62d0aff3eb934439acac47348e2385f0751a1444 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Thu, 2 Jan 2025 15:43:45 +0000 Subject: [PATCH 008/480] [cmake] Extend zstd.dll finding logic from MSVC to Clang (#121437) Extend the special logic for finding `zstd.dll` in `Findzstd` to apply to all MSVC-compatible configurations such as Clang targeting MSVC. Fixes #121345 --- llvm/cmake/modules/Findzstd.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/cmake/modules/Findzstd.cmake b/llvm/cmake/modules/Findzstd.cmake index 86b6d48b6ec6b..f6ca5d1ebe546 100644 --- a/llvm/cmake/modules/Findzstd.cmake +++ b/llvm/cmake/modules/Findzstd.cmake @@ -10,7 +10,7 @@ # zstd::libzstd_shared # zstd::libzstd_static -if(MSVC) +if(MSVC OR "${CMAKE_CXX_SIMULATE_ID}" STREQUAL "MSVC") set(zstd_STATIC_LIBRARY_SUFFIX "_static\\${CMAKE_STATIC_LIBRARY_SUFFIX}$") else() set(zstd_STATIC_LIBRARY_SUFFIX "\\${CMAKE_STATIC_LIBRARY_SUFFIX}$") @@ -33,7 +33,7 @@ if(zstd_FOUND) set(zstd_STATIC_LIBRARY "${zstd_LIBRARY}") elseif (NOT TARGET zstd::libzstd_shared) add_library(zstd::libzstd_shared SHARED IMPORTED) - if(MSVC) + if(MSVC OR "${CMAKE_CXX_SIMULATE_ID}" STREQUAL "MSVC") include(GNUInstallDirs) # For CMAKE_INSTALL_LIBDIR and friends. # IMPORTED_LOCATION is the path to the DLL and IMPORTED_IMPLIB is the "library". get_filename_component(zstd_DIRNAME "${zstd_LIBRARY}" DIRECTORY) From 8ab88f11a12aaecb46f7b0eb5c13e7802258c1e1 Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Thu, 2 Jan 2025 17:03:33 +0100 Subject: [PATCH 009/480] [emacs] Add noext as an attribute in llvm-mode.el (#121444) The NoExt attribute was introduced with #100757, to exist alongside with signext and zeroext. This patch adds "noext" as an attribute to llvm-mode.el to get the proper highlighting of the keyword. 
--- llvm/utils/emacs/llvm-mode.el | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/emacs/llvm-mode.el b/llvm/utils/emacs/llvm-mode.el index dab37833ff63a..660d0718f098c 100644 --- a/llvm/utils/emacs/llvm-mode.el +++ b/llvm/utils/emacs/llvm-mode.el @@ -32,7 +32,7 @@ `(,(regexp-opt '("alwaysinline" "argmemonly" "allocsize" "builtin" "cold" "convergent" "dereferenceable" "dereferenceable_or_null" "hot" "immarg" "inaccessiblememonly" "inaccessiblemem_or_argmemonly" "inalloca" "inlinehint" "jumptable" "minsize" "mustprogress" "naked" "nobuiltin" "nonnull" "nocapture" - "nocallback" "nocf_check" "noduplicate" "nofree" "noimplicitfloat" "noinline" "nomerge" "nonlazybind" "noprofile" "noredzone" "noreturn" + "nocallback" "nocf_check" "noduplicate" "noext" "nofree" "noimplicitfloat" "noinline" "nomerge" "nonlazybind" "noprofile" "noredzone" "noreturn" "norecurse" "nosync" "noundef" "nounwind" "nosanitize_bounds" "nosanitize_coverage" "null_pointer_is_valid" "optdebug" "optforfuzzing" "optnone" "optsize" "preallocated" "readnone" "readonly" "returned" "returns_twice" "shadowcallstack" "signext" "speculatable" "speculative_load_hardening" "ssp" "sspreq" "sspstrong" "safestack" "sanitize_address" "sanitize_hwaddress" "sanitize_memtag" "sanitize_thread" "sanitize_memory" "strictfp" "swifterror" "uwtable" "vscale_range" "willreturn" "writeonly" "zeroext") 'symbols) . font-lock-constant-face) From 11e482c4a32be6a315e5bf2ae7599cf10eb84836 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 2 Jan 2025 23:04:44 +0700 Subject: [PATCH 010/480] RegAllocGreedy: Add dummy priority advisor for writing MIR tests (#121207) I regularly struggle reproducing failures in greedy due to changes in priority when resuming the allocation from MIR vs. a complete compilation starting at IR. That is, the fix in e0919b189bf2df4f97f22ba40260ab5153988b14 did not really fix the problem of the instruction distance mattering. 
Add a way to bypass all of the priority heuristics for MIR tests, by prioritizing only by virtual register number. Could also give this a more specific name, like PrioritizeLowVirtRegNumber --- llvm/lib/CodeGen/RegAllocGreedy.cpp | 6 +++ llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp | 35 +++++++++++- llvm/lib/CodeGen/RegAllocPriorityAdvisor.h | 14 ++++- .../dummy-regalloc-priority-advisor.mir | 54 +++++++++++++++++++ 4 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 4fa2bc76b38b4..95a7801c372f7 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -376,6 +376,12 @@ unsigned DefaultPriorityAdvisor::getPriority(const LiveInterval &LI) const { return Prio; } +unsigned DummyPriorityAdvisor::getPriority(const LiveInterval &LI) const { + // Prioritize by virtual register number, lowest first. 
+ Register Reg = LI.reg(); + return ~Reg.virtRegIndex(); +} + const LiveInterval *RAGreedy::dequeue() { return dequeue(Queue); } const LiveInterval *RAGreedy::dequeue(PQueue &CurQueue) { diff --git a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp index 0650aaff56ea0..4525b8fc5a383 100644 --- a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp +++ b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp @@ -30,7 +30,10 @@ static cl::opt Mode( clEnumValN(RegAllocPriorityAdvisorAnalysis::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocPriorityAdvisorAnalysis::AdvisorMode::Development, - "development", "for training"))); + "development", "for training"), + clEnumValN( + RegAllocPriorityAdvisorAnalysis::AdvisorMode::Dummy, "dummy", + "prioritize low virtual register numbers for test and debug"))); char RegAllocPriorityAdvisorAnalysis::ID = 0; INITIALIZE_PASS(RegAllocPriorityAdvisorAnalysis, "regalloc-priority", @@ -67,6 +70,31 @@ class DefaultPriorityAdvisorAnalysis final } const bool NotAsRequested; }; + +class DummyPriorityAdvisorAnalysis final + : public RegAllocPriorityAdvisorAnalysis { +public: + DummyPriorityAdvisorAnalysis() + : RegAllocPriorityAdvisorAnalysis(AdvisorMode::Dummy) {} + + // support for isa<> and dyn_cast. 
+ static bool classof(const RegAllocPriorityAdvisorAnalysis *R) { + return R->getAdvisorMode() == AdvisorMode::Dummy; + } + +private: + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + RegAllocPriorityAdvisorAnalysis::getAnalysisUsage(AU); + } + + std::unique_ptr + getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override { + return std::make_unique( + MF, RA, &getAnalysis().getSI()); + } +}; + } // namespace template <> Pass *llvm::callDefaultCtor() { @@ -75,6 +103,9 @@ template <> Pass *llvm::callDefaultCtor() { case RegAllocPriorityAdvisorAnalysis::AdvisorMode::Default: Ret = new DefaultPriorityAdvisorAnalysis(/*NotAsRequested*/ false); break; + case RegAllocPriorityAdvisorAnalysis::AdvisorMode::Dummy: + Ret = new DummyPriorityAdvisorAnalysis(); + break; case RegAllocPriorityAdvisorAnalysis::AdvisorMode::Development: #if defined(LLVM_HAVE_TFLITE) Ret = createDevelopmentModePriorityAdvisor(); @@ -97,6 +128,8 @@ StringRef RegAllocPriorityAdvisorAnalysis::getPassName() const { return "Release mode Regalloc Priority Advisor"; case AdvisorMode::Development: return "Development mode Regalloc Priority Advisor"; + case AdvisorMode::Dummy: + return "Dummy Regalloc Priority Advisor"; } llvm_unreachable("Unknown advisor kind"); } diff --git a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h index 1e9fa967214cc..32e4598b71539 100644 --- a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h +++ b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h @@ -56,9 +56,21 @@ class DefaultPriorityAdvisor : public RegAllocPriorityAdvisor { unsigned getPriority(const LiveInterval &LI) const override; }; +/// Stupid priority advisor which just enqueues in virtual register number +/// order, for debug purposes only. 
+class DummyPriorityAdvisor : public RegAllocPriorityAdvisor { +public: + DummyPriorityAdvisor(const MachineFunction &MF, const RAGreedy &RA, + SlotIndexes *const Indexes) + : RegAllocPriorityAdvisor(MF, RA, Indexes) {} + +private: + unsigned getPriority(const LiveInterval &LI) const override; +}; + class RegAllocPriorityAdvisorAnalysis : public ImmutablePass { public: - enum class AdvisorMode : int { Default, Release, Development }; + enum class AdvisorMode : int { Default, Release, Development, Dummy }; RegAllocPriorityAdvisorAnalysis(AdvisorMode Mode) : ImmutablePass(ID), Mode(Mode){}; diff --git a/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir b/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir new file mode 100644 index 0000000000000..5c7c07632f0d5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir @@ -0,0 +1,54 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=greedy,2 -stress-regalloc=4 -stop-after=virtregrewriter,2 -regalloc-enable-priority-advisor=default -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=greedy,2 -stress-regalloc=4 -stop-after=virtregrewriter,2 -regalloc-enable-priority-advisor=dummy -o - %s | FileCheck -check-prefixes=CHECK,DUMMY %s + +# Check that the regalloc-enable-priority-advisor=dummy option works +# and the result is different from the default. 
Ordinarily %1 would be +# prioritized higher than %0 due to the register class priority + +--- +name: foo +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vreg_128 } + - { id: 2, class: vgpr_32 } +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; DEFAULT-LABEL: name: foo + ; DEFAULT: liveins: $vgpr0, $vgpr1 + ; DEFAULT-NEXT: {{ $}} + ; DEFAULT-NEXT: SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) + ; DEFAULT-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; DEFAULT-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; DEFAULT-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + ; DEFAULT-NEXT: renamable $vgpr3 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; DEFAULT-NEXT: renamable $vgpr3 = V_ADD_U32_e32 killed $vgpr2, killed $vgpr3, implicit $exec + ; DEFAULT-NEXT: SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1 + ; + ; DUMMY-LABEL: name: foo + ; DUMMY: liveins: $vgpr0, $vgpr1 + ; DUMMY-NEXT: {{ $}} + ; DUMMY-NEXT: SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5) + ; DUMMY-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; DUMMY-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; DUMMY-NEXT: renamable $vgpr2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; DUMMY-NEXT: renamable $vgpr3_vgpr4_vgpr5_vgpr6 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: 
(load (s128) from %stack.1, align 4, addrspace 5) + ; DUMMY-NEXT: renamable $vgpr3 = V_ADD_U32_e32 killed $vgpr3, killed $vgpr2, implicit $exec + ; DUMMY-NEXT: SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1 + undef %1.sub0:vreg_128 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + %2:vgpr_32 = V_ADD_U32_e32 %1.sub0, %0, implicit $exec + $vgpr3 = COPY %2 + SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1 + +... + +# CHECK: {{.*}} From 40ac34c518985f4ff119d2e67a5a412cc951104a Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 2 Jan 2025 17:23:52 +0100 Subject: [PATCH 011/480] [libc++] Make __type_list variadic (#121117) This makes these lists significantly more readable. --- .../include/__type_traits/aligned_storage.h | 37 +++++++++---------- libcxx/include/__type_traits/make_signed.h | 22 +++++------ libcxx/include/__type_traits/make_unsigned.h | 22 +++++------ libcxx/include/__type_traits/type_list.h | 28 ++++++++------ 4 files changed, 53 insertions(+), 56 deletions(-) diff --git a/libcxx/include/__type_traits/aligned_storage.h b/libcxx/include/__type_traits/aligned_storage.h index 2e39afb7f8808..5cd1f587b988c 100644 --- a/libcxx/include/__type_traits/aligned_storage.h +++ b/libcxx/include/__type_traits/aligned_storage.h @@ -34,26 +34,23 @@ struct __struct_double4 { double __lx[4]; }; -// clang-format off -typedef __type_list<__align_type, - __type_list<__align_type, - __type_list<__align_type, - __type_list<__align_type, - __type_list<__align_type, - __type_list<__align_type, - __type_list<__align_type, - __type_list<__align_type<__struct_double>, - __type_list<__align_type<__struct_double4>, - __type_list<__align_type, - __nat - > > > > > > > > > > __all_types; -// clang-format on +using __all_types = + __type_list<__align_type, + __align_type, + __align_type, + __align_type, + __align_type, + __align_type, + __align_type, + __align_type<__struct_double>, + 
__align_type<__struct_double4>, + 
__align_type >; template struct __find_max_align; -template -struct __find_max_align<__type_list<_Hp, __nat>, _Len> : public integral_constant {}; +template +struct __find_max_align<__type_list<_Head>, _Len> : public integral_constant {}; template struct __select_align { @@ -65,9 +62,11 @@ struct __select_align { static const size_t value = _Len < __max ? __min : __max; }; -template -struct __find_max_align<__type_list<_Hp, _Tp>, _Len> - : public integral_constant::value>::value> {}; +template +struct __find_max_align<__type_list<_Head, _Tail...>, _Len> + : public integral_constant< + size_t, + __select_align<_Len, _Head::value, __find_max_align<__type_list<_Tail...>, _Len>::value>::value> {}; template ::value> struct _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_TEMPLATE_VIS aligned_storage { diff --git a/libcxx/include/__type_traits/make_signed.h b/libcxx/include/__type_traits/make_signed.h index 8070690b3a7a9..5c2739e674352 100644 --- a/libcxx/include/__type_traits/make_signed.h +++ b/libcxx/include/__type_traits/make_signed.h @@ -29,21 +29,17 @@ template using __make_signed_t = __make_signed(_Tp); #else -// clang-format off -typedef __type_list + , + __int128_t # endif - > > > > > __signed_types; -// clang-format on + >; template ::value || is_enum<_Tp>::value> struct __make_signed{}; diff --git a/libcxx/include/__type_traits/make_unsigned.h b/libcxx/include/__type_traits/make_unsigned.h index 562f7bab8a7fb..6c238685c2331 100644 --- a/libcxx/include/__type_traits/make_unsigned.h +++ b/libcxx/include/__type_traits/make_unsigned.h @@ -31,21 +31,17 @@ template using __make_unsigned_t = __make_unsigned(_Tp); #else -// clang-format off -typedef __type_list + , + __uint128_t # endif - > > > > > __unsigned_types; -// clang-format on + >; template ::value || is_enum<_Tp>::value> struct __make_unsigned{}; diff --git a/libcxx/include/__type_traits/type_list.h b/libcxx/include/__type_traits/type_list.h index b4898b36e2d90..34d78fc97c978 100644 --- 
a/libcxx/include/__type_traits/type_list.h +++ b/libcxx/include/__type_traits/type_list.h @@ -11,6 +11,7 @@ #include <__config> #include <__cstddef/size_t.h> +#include <__type_traits/enable_if.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -18,23 +19,28 @@ _LIBCPP_BEGIN_NAMESPACE_STD -template -struct __type_list { - typedef _Hp _Head; - typedef _Tp _Tail; +template +struct __type_list {}; + +template +struct __type_list_head; + +template +struct __type_list_head<__type_list<_Head, _Tail...> > { + using type _LIBCPP_NODEBUG = _Head; }; -template +template ::type)> struct __find_first; -template -struct __find_first<__type_list<_Hp, _Tp>, _Size, true> { - using type _LIBCPP_NODEBUG = _Hp; +template +struct __find_first<__type_list<_Head, _Tail...>, _Size, true> { + using type _LIBCPP_NODEBUG = _Head; }; -template -struct __find_first<__type_list<_Hp, _Tp>, _Size, false> { - using type _LIBCPP_NODEBUG = typename __find_first<_Tp, _Size>::type; +template +struct __find_first<__type_list<_Head, _Tail...>, _Size, false> { + using type _LIBCPP_NODEBUG = typename __find_first<__type_list<_Tail...>, _Size>::type; }; _LIBCPP_END_NAMESPACE_STD From 4075ddad7183e6f0b66e2c8cc7a03b461a8038e6 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 2 Jan 2025 17:30:48 +0100 Subject: [PATCH 012/480] [libc++] Run clang-tidy only once per header (#121436) There doesn't seem to be much of a reason to run clang-tidy twice per headers, and running it only once makes the test a few seconds faster. 
--- libcxx/.clang-tidy | 2 ++ libcxx/test/libcxx/clang_tidy.gen.py | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/libcxx/.clang-tidy b/libcxx/.clang-tidy index f986e2100ca6b..ebbfab0379265 100644 --- a/libcxx/.clang-tidy +++ b/libcxx/.clang-tidy @@ -5,6 +5,8 @@ Checks: > bugprone-stringview-nullptr, bugprone-use-after-move, + libcpp-*, + llvm-include-order, llvm-namespace-comment, diff --git a/libcxx/test/libcxx/clang_tidy.gen.py b/libcxx/test/libcxx/clang_tidy.gen.py index 0db9c0d14b196..06f277e901d33 100644 --- a/libcxx/test/libcxx/clang_tidy.gen.py +++ b/libcxx/test/libcxx/clang_tidy.gen.py @@ -33,8 +33,7 @@ {lit_header_undeprecations.get(header, '')} // TODO: run clang-tidy with modules enabled once they are supported -// RUN: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --checks='-*,libcpp-*' --load=%{{test-tools-dir}}/clang_tidy_checks/libcxx-tidy.plugin -- %{{compile_flags}} -fno-modules -// RUN: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --config-file=%{{libcxx-dir}}/.clang-tidy -- -Wweak-vtables %{{compile_flags}} -fno-modules +// RUN: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --config-file=%{{libcxx-dir}}/.clang-tidy --load=%{{test-tools-dir}}/clang_tidy_checks/libcxx-tidy.plugin -- -Wweak-vtables %{{compile_flags}} -fno-modules #include <{header}> """) From 7326e903d72ba390a6368ff3e9eb2ab2251a1b13 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Thu, 2 Jan 2025 09:06:29 -0800 Subject: [PATCH 013/480] flang: fix backtrace build on FreeBSD (#120297) FreeBSD's libexecinfo defines backtrace with a size_t for the size argument and return type. This almost certainly doesn't make sense, but what's done is done so cast the output to allow compilation. 
Otherwise we get: .../flang/runtime/stop.cpp:165:13: error: non-constant-expression cannot be narrowed from type 'size_t' (aka 'unsigned long') to 'int' in initializer list [-Wc++11-narrowing] 165 | int nptrs{backtrace(buffer, MAX_CALL_STACK)}; | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --- flang/runtime/stop.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/runtime/stop.cpp b/flang/runtime/stop.cpp index f8457e10566a2..a7be8a082e026 100644 --- a/flang/runtime/stop.cpp +++ b/flang/runtime/stop.cpp @@ -162,7 +162,7 @@ static void PrintBacktrace() { // TODO: Need to parse DWARF information to print function line numbers constexpr int MAX_CALL_STACK{999}; void *buffer[MAX_CALL_STACK]; - int nptrs{backtrace(buffer, MAX_CALL_STACK)}; + int nptrs{(int)backtrace(buffer, MAX_CALL_STACK)}; if (char **symbols{backtrace_symbols(buffer, nptrs)}) { for (int i = 0; i < nptrs; i++) { From cbff02b101c20ad6557d64c998d03dab5ee4aad7 Mon Sep 17 00:00:00 2001 From: hatoo Date: Fri, 3 Jan 2025 02:13:27 +0900 Subject: [PATCH 014/480] [mlir][emitc] Fix invalid syntax in example of emitc.return (#121112) A return type of `emitc.func` must be specified with `->` instead of `:`. I've verified the syntax using `mlir-translate --mlir-to-cpp`. --- mlir/include/mlir/Dialect/EmitC/IR/EmitC.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td index 729a573b71c97..744a0dc4770e6 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td +++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td @@ -727,7 +727,7 @@ def EmitC_ReturnOp : EmitC_Op<"return", [Pure, HasParent<"FuncOp">, Example: ```mlir - emitc.func @foo() : (i32) { + emitc.func @foo() -> (i32) { ... 
emitc.return %0 : i32 } From 5ed6229019de43df0ff4b3e73097781e0f1a6651 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 2 Jan 2025 12:46:54 +0000 Subject: [PATCH 015/480] [VectorCombine] Add scalarizeLoadExtract infinite loop test from #120984 regression scalarizeLoadExtract replaces instructions up the use list, which can result in the vectorcombine worklist adding users back to the worklist when they should really be erased first. --- .../X86/load-extractelement-scalarization.ll | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll diff --git a/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll new file mode 100644 index 0000000000000..0acfeccb92ef7 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s + +; infinite loop if we add the erased instructions to the work list in the wrong order. 
+define void @multiple_extract(ptr %p) { +; CHECK-LABEL: @multiple_extract( +; CHECK-NEXT: [[VP:%.*]] = load ptr, ptr [[P:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr [[VP]], i32 0, i64 0 +; CHECK-NEXT: [[E0:%.*]] = load i32, ptr [[TMP1]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[VP]], i32 0, i64 1 +; CHECK-NEXT: [[E1:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: store i32 [[E0]], ptr [[P]], align 4 +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4 +; CHECK-NEXT: store i32 [[E1]], ptr [[P1]], align 4 +; CHECK-NEXT: ret void +; + %vp = load ptr, ptr %p, align 8 + %v = load <2 x i32>, ptr %vp, align 16 + %e0 = extractelement <2 x i32> %v, i64 0 + %e1 = extractelement <2 x i32> %v, i64 1 + store i32 %e0, ptr %p, align 4 + %p1 = getelementptr inbounds nuw i8, ptr %p, i64 4 + store i32 %e1, ptr %p1, align 4 + ret void +} From f739aa4004165dc64d3a1f418d5ad3c84886f01a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 2 Jan 2025 17:17:03 +0000 Subject: [PATCH 016/480] [VectorCombine] replaceValue - add "VC: Replacing" debug message to help the log show replacement for old/new. 
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index dd109637552c4..8509a31766e35 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -128,6 +128,8 @@ class VectorCombine { bool shrinkType(Instruction &I); void replaceValue(Value &Old, Value &New) { + LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n'); + LLVM_DEBUG(dbgs() << " With: " << New << '\n'); Old.replaceAllUsesWith(&New); if (auto *NewI = dyn_cast(&New)) { New.takeName(&Old); From 1849244685bc42b07b1b14e3f62e15c535e74c39 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 2 Jan 2025 17:29:55 +0000 Subject: [PATCH 017/480] [CodeGen] Remove atEnd method from defusechain iterators (#120610) This was not used much and there are better ways of writing it. --- llvm/include/llvm/CodeGen/MachineRegisterInfo.h | 6 ------ llvm/lib/CodeGen/MachineRegisterInfo.cpp | 8 +++++--- llvm/lib/CodeGen/MachineTraceMetrics.cpp | 9 ++++----- llvm/lib/CodeGen/SwiftErrorValueTracking.cpp | 2 +- 4 files changed, 10 insertions(+), 15 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 5dc51aaed81c7..5ee3aef28a4fb 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -1095,9 +1095,6 @@ class MachineRegisterInfo { return !operator==(x); } - /// atEnd - return true if this iterator is equal to reg_end() on the value. - bool atEnd() const { return Op == nullptr; } - // Iterator traversal: forward iteration only defusechain_iterator &operator++() { // Preincrement assert(Op && "Cannot increment end iterator!"); @@ -1203,9 +1200,6 @@ class MachineRegisterInfo { return !operator==(x); } - /// atEnd - return true if this iterator is equal to reg_end() on the value. 
- bool atEnd() const { return Op == nullptr; } - // Iterator traversal: forward iteration only defusechain_instr_iterator &operator++() { // Preincrement assert(Op && "Cannot increment end iterator!"); diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index 6f636a161f500..394b99b85ddcc 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -407,9 +407,11 @@ void MachineRegisterInfo::replaceRegWith(Register FromReg, Register ToReg) { MachineInstr *MachineRegisterInfo::getVRegDef(Register Reg) const { // Since we are in SSA form, we can use the first definition. def_instr_iterator I = def_instr_begin(Reg); - assert((I.atEnd() || std::next(I) == def_instr_end()) && - "getVRegDef assumes a single definition or no definition"); - return !I.atEnd() ? &*I : nullptr; + if (I == def_instr_end()) + return nullptr; + assert(std::next(I) == def_instr_end() && + "getVRegDef assumes at most one definition"); + return &*I; } /// getUniqueVRegDef - Return the unique machine instr that defines the diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp index 6576f97bea25f..021c1a058c020 100644 --- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -683,11 +683,10 @@ struct DataDep { DataDep(const MachineRegisterInfo *MRI, unsigned VirtReg, unsigned UseOp) : UseOp(UseOp) { assert(Register::isVirtualRegister(VirtReg)); - MachineRegisterInfo::def_iterator DefI = MRI->def_begin(VirtReg); - assert(!DefI.atEnd() && "Register has no defs"); - DefMI = DefI->getParent(); - DefOp = DefI.getOperandNo(); - assert((++DefI).atEnd() && "Register has multiple defs"); + MachineOperand *DefMO = MRI->getOneDef(VirtReg); + assert(DefMO && "Register does not have unique def"); + DefMI = DefMO->getParent(); + DefOp = DefMO->getOperandNo(); } }; diff --git a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp 
b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp index 74a94d6110f41..decffdc7dfe45 100644 --- a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp +++ b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp @@ -259,7 +259,7 @@ void SwiftErrorValueTracking::propagateVRegs() { for (const auto &Use : VRegUpwardsUse) { const MachineBasicBlock *UseBB = Use.first.first; Register VReg = Use.second; - if (!MRI.def_begin(VReg).atEnd()) + if (!MRI.def_empty(VReg)) continue; #ifdef EXPENSIVE_CHECKS From 5de7af4b9f05c7a9fb3775f45627b50aba47869b Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Thu, 2 Jan 2025 12:32:42 -0500 Subject: [PATCH 018/480] [llvm][Support][Windows] Fix slash in path for remove_directories (#121448) Before 925471ed903dad871042d7ed0bab89ab6566a564 remove_directories supports path with slash (instead of backslash). The ILCreateFromPathW in new implementation requires backslash path, so the call to remove_directories will fail if the path contains slash. This is to normalize the path to make sure remove_directories still support path with slash as well. --- llvm/lib/Support/Windows/Path.inc | 4 +++- llvm/unittests/Support/Path.cpp | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index 17db114caeb1e..5b311e7c475c5 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -1373,9 +1373,11 @@ std::error_code closeFile(file_t &F) { } std::error_code remove_directories(const Twine &path, bool IgnoreErrors) { + SmallString<128> NativePath; + llvm::sys::path::native(path, NativePath, path::Style::windows_backslash); // Convert to utf-16. 
SmallVector Path16; - std::error_code EC = widenPath(path, Path16); + std::error_code EC = widenPath(NativePath, Path16); if (EC && !IgnoreErrors) return EC; diff --git a/llvm/unittests/Support/Path.cpp b/llvm/unittests/Support/Path.cpp index 8dde2fb50160c..187f47d9cfe07 100644 --- a/llvm/unittests/Support/Path.cpp +++ b/llvm/unittests/Support/Path.cpp @@ -1326,6 +1326,9 @@ TEST_F(FileSystemTest, Remove) { ASSERT_NO_ERROR(fs::remove_directories("D:/footest")); + ASSERT_NO_ERROR(fs::remove_directories(Twine(BaseDir) + "/foo/bar/baz")); + ASSERT_FALSE(fs::exists(Twine(BaseDir) + "/foo/bar/baz")); + ASSERT_NO_ERROR(fs::remove_directories(BaseDir)); ASSERT_FALSE(fs::exists(BaseDir)); } From bca92b12588d63556b749b4627af0112cd2d05c6 Mon Sep 17 00:00:00 2001 From: Angus Lees Date: Fri, 3 Jan 2025 04:33:06 +1100 Subject: [PATCH 019/480] [bazel] Allow SupportTests to be built remotely and cached (#121375) `SupportTests` fails in the bazel macOS sandbox, because `FileSystemTest.permissions` expects to be able to modify file permissions on some otherwise protected files. Previously this test was marked `local` in bazel, which has additional undesirable effects such as skipping remote build and cache. Tighten the bazel tags to just `no-sandbox`. Note in particular, that this allows the test to build, execute, and cache remotely (if configured). Testing: - Verified this test fails (as expected) on macOS with no tags, and passes with `no-sandbox`. - Verified this test passes when executed remotely (using an Engflow RBE setup) with `no-sandbox`. 
--- utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel index 8a6950facbdf2..d576a9190d09b 100644 --- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel @@ -697,7 +697,7 @@ cc_test( ], linkstatic = 1, tags = [ - "local", # Not compatible with the sandbox on MacOS + "no-sandbox", # FileSystemTest.permissions not compatible with the sandbox on MacOS ], deps = [ "//llvm:AllTargetsCodeGens", From dd30aa83aa12e5b2b5e58cb72ec85070f725df34 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 2 Jan 2025 09:36:01 -0800 Subject: [PATCH 020/480] [RISCV][TTI] Simplify compound check for readability [nfc] (#121504) I misread this check earlier today on a review, so restructure it to be easier to quickly scan. --- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 0abb270edcabc..909a64e974255 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -2558,8 +2558,10 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { TTI::MemCmpExpansionOptions Options; // TODO: Enable expansion when unaligned access is not supported after we fix // issues in ExpandMemcmp. 
- if (!(ST->enableUnalignedScalarMem() && - (ST->hasStdExtZbb() || ST->hasStdExtZbkb() || IsZeroCmp))) + if (!ST->enableUnalignedScalarMem()) + return Options; + + if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp) return Options; Options.AllowOverlappingLoads = true; From 035e64c0ec02b237a266ebc672718037fdd53eb2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 2 Jan 2025 18:18:55 +0000 Subject: [PATCH 021/480] [VectorCombine] eraseInstruction - ensure we reattempt to fold other users of an erased instruction's operands (REAPPLIED) As we're reducing the use count of the operands it's more likely that they will now fold, as they were previously being prevented by a m_OneUse check, or the cost of retaining the extra instruction had been too high. This is necessary for some upcoming patches, although the only change so far is instruction ordering as it allows some SSE folds of 256/512-bit with 128-bit subvectors to occur earlier in foldShuffleToIdentity as the subvector concats are free. Reapplied with a fix for foldSingleElementStore/scalarizeLoadExtract which were replacing/removing memory operations - we need to ensure that the worklist is populated in the correct order so all users of the old memory operations are erased first, so there are no remaining users of the loads when it's time to remove them as well. 
Pulled out of #120984 --- .../Transforms/Vectorize/VectorCombine.cpp | 19 +++++- .../VectorCombine/X86/concat-boolmasks.ll | 64 ++++++++++++++----- 2 files changed, 66 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 8509a31766e35..493ed95b1d22e 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -141,10 +141,17 @@ class VectorCombine { void eraseInstruction(Instruction &I) { LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n'); - for (Value *Op : I.operands()) - Worklist.pushValue(Op); + SmallVector Ops(I.operands()); Worklist.remove(&I); I.eraseFromParent(); + + // Push remaining users of the operands and then the operand itself - allows + // further folds that were hindered by OneUse limits. + for (Value *Op : Ops) + if (auto *OpI = dyn_cast(Op)) { + Worklist.pushUsersToWorkList(*OpI); + Worklist.pushValue(OpI); + } } }; } // namespace @@ -1337,6 +1344,10 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) { MemoryLocation::get(SI), AA)) return false; + // Ensure we add the load back to the worklist BEFORE its users so they can + // erased in the correct order. + Worklist.push(Load); + if (ScalarizableIdx.isSafeWithFreeze()) ScalarizableIdx.freeze(Builder, *cast(Idx)); Value *GEP = Builder.CreateInBoundsGEP( @@ -1425,6 +1436,10 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (ScalarizedCost >= OriginalCost) return false; + // Ensure we add the load back to the worklist BEFORE its users so they can + // erased in the correct order. + Worklist.push(LI); + // Replace extracts with narrow scalar loads. 
for (User *U : LI->users()) { auto *EI = cast(U); diff --git a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll index 057d9af314ba3..c3639baf8b650 100644 --- a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll +++ b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll @@ -80,13 +80,29 @@ define i64 @movmsk_i64_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) { } define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { -; CHECK-LABEL: @movmsk_i64_v64i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64 -; CHECK-NEXT: ret i64 [[OR]] +; SSE-LABEL: @movmsk_i64_v64i8_v16i8( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <64 x i32> +; SSE-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer +; SSE-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64 +; SSE-NEXT: ret i64 [[OR]] +; +; AVX2-LABEL: @movmsk_i64_v64i8_v16i8( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer +; AVX2-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64 +; 
AVX2-NEXT: ret i64 [[OR]] +; +; AVX512-LABEL: @movmsk_i64_v64i8_v16i8( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> +; AVX512-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer +; AVX512-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64 +; AVX512-NEXT: ret i64 [[OR]] ; %c0 = icmp slt <16 x i8> %v0, zeroinitializer %c1 = icmp slt <16 x i8> %v1, zeroinitializer @@ -110,14 +126,32 @@ define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, } define i64 @movmsk_i64_v32i32_v4i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { -; CHECK-LABEL: @movmsk_i64_v32i32_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 -; CHECK-NEXT: [[OR:%.*]] = zext i16 [[TMP5]] to i64 -; CHECK-NEXT: ret i64 [[OR]] +; SSE-LABEL: @movmsk_i64_v32i32_v4i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <16 x i32> +; SSE-NEXT: [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer +; SSE-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; SSE-NEXT: [[OR:%.*]] = zext i16 [[TMP5]] to i64 +; SSE-NEXT: ret i64 [[OR]] +; +; AVX2-LABEL: @movmsk_i64_v32i32_v4i32( +; AVX2-NEXT: [[TMP1:%.*]] 
= shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer +; AVX2-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; AVX2-NEXT: [[OR:%.*]] = zext i16 [[TMP5]] to i64 +; AVX2-NEXT: ret i64 [[OR]] +; +; AVX512-LABEL: @movmsk_i64_v32i32_v4i32( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> +; AVX512-NEXT: [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer +; AVX512-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; AVX512-NEXT: [[OR:%.*]] = zext i16 [[TMP5]] to i64 +; AVX512-NEXT: ret i64 [[OR]] ; %c0 = icmp slt <4 x i32> %v0, zeroinitializer %c1 = icmp slt <4 x i32> %v1, zeroinitializer From 5236e3dac59e16630a3730c84c2d3d65970a6db3 Mon Sep 17 00:00:00 2001 From: Dominik Adamski Date: Thu, 2 Jan 2025 20:18:55 +0100 Subject: [PATCH 022/480] [Flang][Alias analysis] Fix alias analysis for omp private allocatable item (#120243) Flang alias analysis crashes for omp private allocatable item. The issue is described here : https://github.com/llvm/llvm-project/issues/116954 . We know that private value can't alias with anything else unless it is POINTER or TARGET. That's why we can simplify alias analysis logic. 
--- .../lib/Optimizer/Analysis/AliasAnalysis.cpp | 36 ++++++------- ...lias-analysis-omp-private-allocatable.mlir | 50 +++++++++++++++++++ 2 files changed, 64 insertions(+), 22 deletions(-) create mode 100644 flang/test/Analysis/AliasAnalysis/alias-analysis-omp-private-allocatable.mlir diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp index 611f212269fb7..e33d8fa333e7a 100644 --- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp +++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp @@ -505,30 +505,17 @@ getAttrsFromVariable(fir::FortranVariableOpInterface var) { } template -static Value getPrivateArg(omp::BlockArgOpenMPOpInterface &argIface, - OMPTypeOp &op, DeclTypeOp &declOp) { - Value privateArg; +static bool isPrivateArg(omp::BlockArgOpenMPOpInterface &argIface, + OMPTypeOp &op, DeclTypeOp &declOp) { if (!op.getPrivateSyms().has_value()) - return privateArg; + return false; for (auto [opSym, blockArg] : llvm::zip_equal(*op.getPrivateSyms(), argIface.getPrivateBlockArgs())) { if (blockArg == declOp.getMemref()) { - omp::PrivateClauseOp privateOp = - SymbolTable::lookupNearestSymbolFrom( - op, cast(opSym)); - privateOp.walk([&](omp::YieldOp yieldOp) { - // TODO Extend alias analysis if omp.yield points to - // block argument value - if (!yieldOp.getResults()[0].getDefiningOp()) - return; - llvm::TypeSwitch(yieldOp.getResults()[0].getDefiningOp()) - .template Case( - [&](auto declOp) { privateArg = declOp.getMemref(); }); - }); - return privateArg; + return true; } } - return privateArg; + return false; } AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, @@ -631,6 +618,7 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, breakFromLoop = true; }) .Case([&](auto op) { + bool isPrivateItem = false; if (omp::BlockArgOpenMPOpInterface argIface = dyn_cast(op->getParentOp())) { Value ompValArg; @@ -644,19 +632,18 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, 
omp::MapInfoOp mapInfo = llvm::cast(opArg.getDefiningOp()); ompValArg = mapInfo.getVarPtr(); - break; + return; } } // If given operation does not reflect mapping item, // check private clause - if (!ompValArg) - ompValArg = getPrivateArg(argIface, targetOp, op); + isPrivateItem = isPrivateArg(argIface, targetOp, op); }) .template Case( [&](auto privateOp) { - ompValArg = getPrivateArg(argIface, privateOp, op); + isPrivateItem = isPrivateArg(argIface, privateOp, op); }); if (ompValArg) { v = ompValArg; @@ -706,6 +693,11 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, } else { instantiationPoint = op; } + if (isPrivateItem) { + type = SourceKind::Allocate; + breakFromLoop = true; + return; + } // TODO: Look for the fortran attributes present on the operation // Track further through the operand v = op.getMemref(); diff --git a/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-private-allocatable.mlir b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-private-allocatable.mlir new file mode 100644 index 0000000000000..5116622364fad --- /dev/null +++ b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-private-allocatable.mlir @@ -0,0 +1,50 @@ +// Use --mlir-disable-threading so that the AA queries are serialized +// as well as its diagnostic output. 
+// RUN: fir-opt %s -pass-pipeline='builtin.module(func.func(test-fir-alias-analysis))' -split-input-file --mlir-disable-threading 2>&1 | FileCheck %s + +// Fortran code before simplification: +// SUBROUTINE mysub(ns,ne) +// INTEGER :: n +// REAL(KIND=8), DIMENSION(:), allocatable :: ar1 +// real(kind=8), dimension(20) :: ar2 +// REAL(KIND=8), DIMENSION(20) :: d +// +//!$OMP parallel PRIVATE(ar1) +// d(1:1) = (/(DOT_PRODUCT(ar1(1:n), ar2(1:n)),n=1, 1)/) +//!$OMP END parallel +// END SUBROUTINE + +// CHECK-LABEL: Testing : "testPrivateAllocatable" +// CHECK: ar2#0 <-> ar1#0: NoAlias +// CHECK: ar2#1 <-> ar1#0: NoAlias +// CHECK: ar2#0 <-> ar1#1: NoAlias +// CHECK: ar2#1 <-> ar1#1: NoAlias + +omp.private {type = private} @_QFmysubEar1_private_ref_box_heap_Uxf64 : !fir.ref>>> alloc { +^bb0(%arg0: !fir.ref>>>): + %0 = fir.alloca !fir.box>> {bindc_name = "ar1", pinned, uniq_name = "_QFmysubEar1"} + %5:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFmysubEar1"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + omp.yield(%5#0 : !fir.ref>>>) +} dealloc { +^bb0(%arg0: !fir.ref>>>): + omp.yield +} +func.func @testPrivateAllocatable(%arg0: !fir.ref {fir.bindc_name = "ns"}, %arg1: !fir.ref {fir.bindc_name = "ne"}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.alloca !fir.box>> {bindc_name = "ar1", uniq_name = "_QFmysubEar1"} + %2 = fir.zero_bits !fir.heap> + %c0 = arith.constant 0 : index + %3 = fir.shape %c0 : (index) -> !fir.shape<1> + %4 = fir.embox %2(%3) : (!fir.heap>, !fir.shape<1>) -> !fir.box>> + fir.store %4 to %1 : !fir.ref>>> + %5:2 = hlfir.declare %1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFmysubEar1"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %c20 = arith.constant 20 : index + %6 = fir.alloca !fir.array<20xf64> {bindc_name = "ar2", uniq_name = "_QFmysubEar2"} + %7 = fir.shape %c20 : (index) -> !fir.shape<1> + %8:2 = hlfir.declare %6(%7) {uniq_name = "_QFmysubEar2", test.ptr="ar2" } : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, 
!fir.ref>) + omp.parallel private(@_QFmysubEar1_private_ref_box_heap_Uxf64 %5#0 -> %arg2 : !fir.ref>>>) { + %20:2 = hlfir.declare %arg2 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFmysubEar1", test.ptr = "ar1"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + omp.terminator + } + return +} From 5f5792aedb1f8088836ccd1c0a924c5e0bbf35db Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 2 Jan 2025 20:10:45 +0000 Subject: [PATCH 023/480] [VPlan] Use removeDeadRecipes in optimizeForVFAndUF (NFCI) Split off from https://github.com/llvm/llvm-project/pull/108378. --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 89aab71905a29..8ac2bd5160c26 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -842,11 +842,11 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, VPInstruction::BranchOnCond, {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc()); - SmallVector PossiblyDead(Term->operands()); Term->eraseFromParent(); - for (VPValue *Op : PossiblyDead) - recursivelyDeleteDeadRecipes(Op); ExitingVPBB->appendRecipe(BOC); + + VPlanTransforms::removeDeadRecipes(Plan); + Plan.setVF(BestVF); Plan.setUF(BestUF); // TODO: Further simplifications are possible From 3a423a10ff83684332195b5191b16f12c81985ba Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Thu, 2 Jan 2025 12:11:59 -0800 Subject: [PATCH 024/480] [MemProf][PGO] Prevent dropping of profile metadata during optimization (#121359) This patch fixes a couple of places where memprof-related metadata (!memprof and !callsite) were being dropped, and one place where PGO metadata (!prof) was being dropped. All were due to instances of combineMetadata() being invoked. 
That function drops all metadata not in the list provided by the client, and also drops any not in its switch statement. Memprof metadata needed a case in the combineMetadata switch statement. For now we simply keep the metadata of the instruction being kept, which doesn't retain all the profile information when two calls with memprof metadata are being combined, but at least retains some. For the memprof metadata being dropped during call CSE, add memprof and callsite metadata to the list of known ids in combineMetadataForCSE. Neither memprof nor regular prof metadata were in the list of known ids for the callsite in MemCpyOptimizer, which was added to combine AA metadata after optimization of byval arguments fed by memcpy instructions, and similar types of optimizations of memcpy uses. There is one other callsite of combineMetadata, but it is only invoked on load instructions, which do not carry these types of metadata. --- llvm/include/llvm/IR/Metadata.h | 2 + llvm/include/llvm/Transforms/Utils/Local.h | 5 ++ llvm/lib/Analysis/MemoryProfileInfo.cpp | 17 +++++++ .../Transforms/InstCombine/InstCombinePHI.cpp | 3 ++ .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 12 +++-- llvm/lib/Transforms/Utils/Local.cpp | 17 ++++++- llvm/test/Transforms/MemCpyOpt/memcpy.ll | 18 +++++++ .../SimplifyCFG/merge-calls-memprof.ll | 51 +++++++++++++++++++ 8 files changed, 120 insertions(+), 5 deletions(-) create mode 100644 llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h index 35580f3f38c61..df2384c5f6e69 100644 --- a/llvm/include/llvm/IR/Metadata.h +++ b/llvm/include/llvm/IR/Metadata.h @@ -1464,6 +1464,8 @@ class MDNode : public Metadata { static MDNode *getMergedProfMetadata(MDNode *A, MDNode *B, const Instruction *AInstr, const Instruction *BInstr); + static MDNode *getMergedMemProfMetadata(MDNode *A, MDNode *B); + static MDNode *getMergedCallsiteMetadata(MDNode *A, MDNode *B); }; /// Tuple of 
metadata. diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h index bbf29e6f46b47..40c448593807b 100644 --- a/llvm/include/llvm/Transforms/Utils/Local.h +++ b/llvm/include/llvm/Transforms/Utils/Local.h @@ -412,6 +412,11 @@ Instruction *removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU = nullptr); bool removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU = nullptr, MemorySSAUpdater *MSSAU = nullptr); +/// DO NOT CALL EXTERNALLY. +/// FIXME: https://github.com/llvm/llvm-project/issues/121495 +/// Once external callers of this function are removed, either inline into +/// combineMetadataForCSE, or internalize and remove KnownIDs parameter. +/// /// Combine the metadata of two instructions so that K can replace J. Some /// metadata kinds can only be kept if K does not move, meaning it dominated /// J in the original IR. diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index 1c3f589e84941..2f3c87a89f9f9 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -347,3 +347,20 @@ template <> uint64_t CallStack::back() const { return mdconst::dyn_extract(N->operands().back()) ->getZExtValue(); } + +MDNode *MDNode::getMergedMemProfMetadata(MDNode *A, MDNode *B) { + // TODO: Support more sophisticated merging, such as selecting the one with + // more bytes allocated, or implement support for carrying multiple allocation + // leaf contexts. For now, keep the first one. + if (A) + return A; + return B; +} + +MDNode *MDNode::getMergedCallsiteMetadata(MDNode *A, MDNode *B) { + // TODO: Support more sophisticated merging, which will require support for + // carrying multiple contexts. For now, keep the first one. 
+ if (A) + return A; + return B; +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 1fcf1c570adda..272a1942c3350 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -788,6 +788,9 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) { BasicBlock *BB = std::get<0>(Incoming); Value *V = std::get<1>(Incoming); LoadInst *LI = cast(V); + // FIXME: https://github.com/llvm/llvm-project/issues/121495 + // Call combineMetadataForCSE instead, so that an explicit set of KnownIDs + // doesn't need to be maintained here. combineMetadata(NewLI, LI, KnownIDs, true); Value *NewInVal = LI->getOperand(0); if (NewInVal != InVal) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index bb98b3d1c0725..5f7cb92d239bc 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -345,10 +345,14 @@ static bool writtenBetween(MemorySSA *MSSA, BatchAAResults &AA, static void combineAAMetadata(Instruction *ReplInst, Instruction *I) { // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be // handled here, but combineMetadata doesn't support them yet - unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_invariant_group, - LLVMContext::MD_access_group}; + unsigned KnownIDs[] = { + LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, LLVMContext::MD_invariant_group, + LLVMContext::MD_access_group, LLVMContext::MD_prof, + LLVMContext::MD_memprof, LLVMContext::MD_callsite}; + // FIXME: https://github.com/llvm/llvm-project/issues/121495 + // Use custom AA metadata combining handling instead of combineMetadata, which + // is meant for CSE and will drop any metadata not in the KnownIDs list. 
combineMetadata(ReplInst, I, KnownIDs, true); } diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index a3af96d5af026..1e4061cb0771e 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3308,6 +3308,9 @@ bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU, return Changed; } +// FIXME: https://github.com/llvm/llvm-project/issues/121495 +// Once external callers of this function are removed, either inline into +// combineMetadataForCSE, or internalize and remove KnownIDs parameter. void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef KnownIDs, bool DoesKMove) { SmallVector, 4> Metadata; @@ -3320,6 +3323,10 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, switch (Kind) { default: + // FIXME: https://github.com/llvm/llvm-project/issues/121495 + // Change to removing only explicitly listed other metadata, and assert + // on unknown metadata, to avoid inadvertently dropping newly added + // metadata types. K->setMetadata(Kind, nullptr); // Remove unknown metadata break; case LLVMContext::MD_dbg: @@ -3379,6 +3386,12 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, K->setMetadata(Kind, MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); break; + case LLVMContext::MD_memprof: + K->setMetadata(Kind, MDNode::getMergedMemProfMetadata(KMD, JMD)); + break; + case LLVMContext::MD_callsite: + K->setMetadata(Kind, MDNode::getMergedCallsiteMetadata(KMD, JMD)); + break; case LLVMContext::MD_preserve_access_index: // Preserve !preserve.access.index in K. 
break; @@ -3442,7 +3455,9 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J, LLVMContext::MD_nontemporal, LLVMContext::MD_noundef, LLVMContext::MD_mmra, - LLVMContext::MD_noalias_addrspace}; + LLVMContext::MD_noalias_addrspace, + LLVMContext::MD_memprof, + LLVMContext::MD_callsite}; combineMetadata(K, J, KnownIDs, KDominatesJ); } diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll index 39b90adc74ef3..65d78f4199aa0 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll @@ -803,6 +803,19 @@ define void @byval_param_noalias_metadata(ptr align 4 byval(i32) %ptr) { ret void } +define void @byval_param_profile_metadata(ptr align 4 byval(i32) %ptr) { +; CHECK-LABEL: @byval_param_profile_metadata( +; CHECK-NEXT: store i32 1, ptr [[PTR2:%.*]], align 4 +; CHECK-NEXT: call void @f_byval(ptr byval(i32) align 4 [[PTR2]]), !prof [[PROF3:![0-9]+]], !memprof [[META4:![0-9]+]], !callsite [[META7:![0-9]+]] +; CHECK-NEXT: ret void +; + %tmp = alloca i32, align 4 + store i32 1, ptr %ptr + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %tmp, ptr align 4 %ptr, i64 4, i1 false) + call void @f_byval(ptr align 4 byval(i32) %tmp), !memprof !3, !callsite !6, !prof !7 + ret void +} + define void @memcpy_memory_none(ptr %p, ptr %p2, i64 %size) { ; CHECK-LABEL: @memcpy_memory_none( ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[P:%.*]], ptr [[P2:%.*]], i64 [[SIZE:%.*]], i1 false) #[[ATTR7:[0-9]+]] @@ -897,3 +910,8 @@ define void @memcpy_immut_escape_after(ptr align 4 noalias %val) { !0 = !{!0} !1 = !{!1, !0} !2 = !{!1} +!3 = !{!4} +!4 = !{!5, !"cold"} +!5 = !{i64 123, i64 456} +!6 = !{i64 123} +!7 = !{!"branch_weights", i32 10} diff --git a/llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll b/llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll new file mode 100644 index 0000000000000..10c6aeb26ba76 --- /dev/null +++ 
b/llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 + +;; Test to ensure that memprof related metadata is not dropped when +;; instructions are combined. Currently the metadata from the first instruction +;; is kept, which prevents full loss of profile context information. + +; RUN: opt < %s -passes=simplifycfg -S | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local noundef nonnull ptr @_Z4testb(i1 noundef zeroext %b) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local noundef nonnull ptr @_Z4testb( +; CHECK-SAME: i1 noundef zeroext [[B:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4), !memprof [[META0:![0-9]+]], !callsite [[META3:![0-9]+]] +; CHECK-NEXT: ret ptr [[CALL]] +; +entry: + br i1 %b, label %if.then, label %if.else + +if.then: ; preds = %entry + %call = call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4), !memprof !0, !callsite !3 + br label %if.end + +if.else: ; preds = %entry + %call1 = call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4), !memprof !4, !callsite !7 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %x.0 = phi ptr [ %call, %if.then ], [ %call1, %if.else ] + ret ptr %x.0 +} + + +declare ptr @_Znwm(i64) nounwind readonly + +!0 = !{!1} +!1 = !{!2, !"notcold"} +!2 = !{i64 -852997907418798798, i64 -2101080423462424381, i64 5188446645037944434} +!3 = !{i64 -852997907418798798} +!4 = !{!5} +!5 = !{!6, !"cold"} +!6 = !{i64 123, i64 -2101080423462424381, i64 5188446645037944434} +!7 = !{i64 123} +;. 
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]} +; CHECK: [[META1]] = !{[[META2:![0-9]+]], !"notcold"} +; CHECK: [[META2]] = !{i64 -852997907418798798, i64 -2101080423462424381, i64 5188446645037944434} +; CHECK: [[META3]] = !{i64 -852997907418798798} +;. From cd19f3f787b01481fd687834457686e16fffdbe6 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Fri, 3 Jan 2025 05:18:33 +0900 Subject: [PATCH 025/480] [Driver][clang-linker-wrapper] Add initial support for OpenMP offloading to generic SPIR-V (#120145) This is the first of a series of patches to add support for OpenMP offloading to SPIR-V through liboffload with the first intended target being Intel GPUs. This patch implements the basic driver and `clang-linker-wrapper` work for JIT mode. There are still many missing pieces, so this is not yet usable. We introduce `spirv64-intel-unknown` as the only currently supported triple. The user-facing argument to enable offloading will be `-fopenmp -fopenmp-targets=spirv64-intel` Add a new `SPIRVOpenMPToolChain` toolchain based on the existing general SPIR-V toolchain which will call all the required SPIR-V tools (and eventually the SPIR-V backend) as well as add the corresponding device RTL as an argument to the linker. We can't get through the front end consistently yet, so it's difficult to add any LIT tests that execute any tools, but front end changes are planned very shortly, and then we can add those tests. 
--------- Signed-off-by: Sarnie, Nick --- clang/include/clang/Driver/Options.td | 2 + clang/lib/Driver/CMakeLists.txt | 1 + clang/lib/Driver/Driver.cpp | 12 ++-- clang/lib/Driver/ToolChains/CommonArgs.cpp | 9 ++- clang/lib/Driver/ToolChains/SPIRV.h | 2 +- clang/lib/Driver/ToolChains/SPIRVOpenMP.cpp | 34 ++++++++++ clang/lib/Driver/ToolChains/SPIRVOpenMP.h | 29 +++++++++ clang/lib/Frontend/CompilerInvocation.cpp | 1 + .../spirv-openmp/lib/libomptarget-spirv64.bc | 0 clang/test/Driver/spirv-openmp-toolchain.c | 64 +++++++++++++++++++ .../ClangLinkerWrapper.cpp | 5 +- 11 files changed, 149 insertions(+), 10 deletions(-) create mode 100644 clang/lib/Driver/ToolChains/SPIRVOpenMP.cpp create mode 100644 clang/lib/Driver/ToolChains/SPIRVOpenMP.h create mode 100644 clang/test/Driver/Inputs/spirv-openmp/lib/libomptarget-spirv64.bc create mode 100644 clang/test/Driver/spirv-openmp-toolchain.c diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d922709db1778..523761f5e0d80 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1493,6 +1493,8 @@ def libomptarget_amdgcn_bc_path_EQ : Joined<["--"], "libomptarget-amdgcn-bc-path HelpText<"Path to libomptarget-amdgcn bitcode library">, Alias; def libomptarget_nvptx_bc_path_EQ : Joined<["--"], "libomptarget-nvptx-bc-path=">, Group, HelpText<"Path to libomptarget-nvptx bitcode library">; +def libomptarget_spirv_bc_path_EQ : Joined<["--"], "libomptarget-spirv-bc-path=">, Group, + HelpText<"Path to libomptarget-spirv bitcode library">; def dD : Flag<["-"], "dD">, Group, Visibility<[ClangOption, CC1Option]>, HelpText<"Print macro definitions in -E mode in addition to normal output">; def dI : Flag<["-"], "dI">, Group, Visibility<[ClangOption, CC1Option]>, diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt index 4fd10bf671512..57d04c3fefa84 100644 --- a/clang/lib/Driver/CMakeLists.txt +++ b/clang/lib/Driver/CMakeLists.txt 
@@ -77,6 +77,7 @@ add_clang_library(clangDriver ToolChains/RISCVToolchain.cpp ToolChains/Solaris.cpp ToolChains/SPIRV.cpp + ToolChains/SPIRVOpenMP.cpp ToolChains/TCE.cpp ToolChains/UEFI.cpp ToolChains/VEToolchain.cpp diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index dc84c1b9d1cc4..bc5ce9f14ab69 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -43,6 +43,7 @@ #include "ToolChains/PS4CPU.h" #include "ToolChains/RISCVToolchain.h" #include "ToolChains/SPIRV.h" +#include "ToolChains/SPIRVOpenMP.h" #include "ToolChains/Solaris.h" #include "ToolChains/TCE.h" #include "ToolChains/UEFI.h" @@ -890,9 +891,9 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, HostTC->getTriple()); // Attempt to deduce the offloading triple from the set of architectures. - // We can only correctly deduce NVPTX / AMDGPU triples currently. We need - // to temporarily create these toolchains so that we can access tools for - // inferring architectures. + // We can only correctly deduce NVPTX / AMDGPU triples currently. + // We need to temporarily create these toolchains so that we can access + // tools for inferring architectures. llvm::DenseSet Archs; if (NVPTXTriple) { auto TempTC = std::make_unique( @@ -962,7 +963,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, const ToolChain *TC; // Device toolchains have to be selected differently. They pair host // and device in their implementation. 
- if (TT.isNVPTX() || TT.isAMDGCN()) { + if (TT.isNVPTX() || TT.isAMDGCN() || TT.isSPIRV()) { const ToolChain *HostTC = C.getSingleOffloadToolChain(); assert(HostTC && "Host toolchain should be always defined."); @@ -975,6 +976,9 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, else if (TT.isAMDGCN()) DeviceTC = std::make_unique( *this, TT, *HostTC, C.getInputArgs()); + else if (TT.isSPIRV()) + DeviceTC = std::make_unique( + *this, TT, *HostTC, C.getInputArgs()); else assert(DeviceTC && "Device toolchain not defined."); } diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 8b9639061d543..60214c4d59cee 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -2839,10 +2839,13 @@ void tools::addOpenMPDeviceRTL(const Driver &D, LibraryPaths.emplace_back(LibPath); OptSpecifier LibomptargetBCPathOpt = - Triple.isAMDGCN() ? options::OPT_libomptarget_amdgpu_bc_path_EQ - : options::OPT_libomptarget_nvptx_bc_path_EQ; + Triple.isAMDGCN() ? options::OPT_libomptarget_amdgpu_bc_path_EQ + : Triple.isNVPTX() ? options::OPT_libomptarget_nvptx_bc_path_EQ + : options::OPT_libomptarget_spirv_bc_path_EQ; - StringRef ArchPrefix = Triple.isAMDGCN() ? "amdgpu" : "nvptx"; + StringRef ArchPrefix = Triple.isAMDGCN() ? "amdgpu" + : Triple.isNVPTX() ? 
"nvptx" + : "spirv64"; std::string LibOmpTargetName = ("libomptarget-" + ArchPrefix + ".bc").str(); // First check whether user specifies bc library diff --git a/clang/lib/Driver/ToolChains/SPIRV.h b/clang/lib/Driver/ToolChains/SPIRV.h index d59a8c76ed473..415f639bba3ec 100644 --- a/clang/lib/Driver/ToolChains/SPIRV.h +++ b/clang/lib/Driver/ToolChains/SPIRV.h @@ -52,7 +52,7 @@ class LLVM_LIBRARY_VISIBILITY Linker final : public Tool { namespace toolchains { -class LLVM_LIBRARY_VISIBILITY SPIRVToolChain final : public ToolChain { +class LLVM_LIBRARY_VISIBILITY SPIRVToolChain : public ToolChain { mutable std::unique_ptr Translator; public: diff --git a/clang/lib/Driver/ToolChains/SPIRVOpenMP.cpp b/clang/lib/Driver/ToolChains/SPIRVOpenMP.cpp new file mode 100644 index 0000000000000..1f27245e2839c --- /dev/null +++ b/clang/lib/Driver/ToolChains/SPIRVOpenMP.cpp @@ -0,0 +1,34 @@ +//==- SPIRVOpenMP.cpp - SPIR-V OpenMP Tool Implementations --------*- C++ -*==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//==------------------------------------------------------------------------==// +#include "SPIRVOpenMP.h" +#include "CommonArgs.h" + +using namespace clang::driver; +using namespace clang::driver::toolchains; +using namespace clang::driver::tools; +using namespace llvm::opt; + +namespace clang::driver::toolchains { +SPIRVOpenMPToolChain::SPIRVOpenMPToolChain(const Driver &D, + const llvm::Triple &Triple, + const ToolChain &HostToolchain, + const ArgList &Args) + : SPIRVToolChain(D, Triple, Args), HostTC(HostToolchain) {} + +void SPIRVOpenMPToolChain::addClangTargetOptions( + const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadingKind) const { + + if (DeviceOffloadingKind != Action::OFK_OpenMP) + return; + + if (DriverArgs.hasArg(options::OPT_nogpulib)) + return; + addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, "", getTriple(), HostTC); +} +} // namespace clang::driver::toolchains diff --git a/clang/lib/Driver/ToolChains/SPIRVOpenMP.h b/clang/lib/Driver/ToolChains/SPIRVOpenMP.h new file mode 100644 index 0000000000000..64404e2a28210 --- /dev/null +++ b/clang/lib/Driver/ToolChains/SPIRVOpenMP.h @@ -0,0 +1,29 @@ +//===--- SPIRVOpenMP.h - SPIR-V OpenMP Tool Implementations ------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_SPIRV_OPENMP_H +#define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_SPIRV_OPENMP_H + +#include "SPIRV.h" +#include "clang/Driver/Tool.h" +#include "clang/Driver/ToolChain.h" + +namespace clang::driver::toolchains { +class LLVM_LIBRARY_VISIBILITY SPIRVOpenMPToolChain : public SPIRVToolChain { +public: + SPIRVOpenMPToolChain(const Driver &D, const llvm::Triple &Triple, + const ToolChain &HostTC, const llvm::opt::ArgList &Args); + + void addClangTargetOptions( + const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadingKind) const override; + + const ToolChain &HostTC; +}; +} // namespace clang::driver::toolchains +#endif diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 348c56cc37da3..0ae6dce5dd40a 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4263,6 +4263,7 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, if (TT.getArch() == llvm::Triple::UnknownArch || !(TT.getArch() == llvm::Triple::aarch64 || TT.isPPC() || + TT.getArch() == llvm::Triple::spirv64 || TT.getArch() == llvm::Triple::systemz || TT.getArch() == llvm::Triple::loongarch64 || TT.getArch() == llvm::Triple::nvptx || diff --git a/clang/test/Driver/Inputs/spirv-openmp/lib/libomptarget-spirv64.bc b/clang/test/Driver/Inputs/spirv-openmp/lib/libomptarget-spirv64.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/spirv-openmp-toolchain.c b/clang/test/Driver/spirv-openmp-toolchain.c new file mode 100644 index 0000000000000..3eb1f22a03ed0 --- /dev/null +++ b/clang/test/Driver/spirv-openmp-toolchain.c @@ -0,0 +1,64 @@ +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=spirv64-intel 
\ +// RUN: --libomptarget-spirv-bc-path=%t/ -nogpulib %s 2>&1 \ +// RUN: | FileCheck %s + +// verify the tools invocations +// CHECK: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-emit-llvm-bc"{{.*}}"-x" "c" +// CHECK: "-cc1" "-triple" "spirv64-intel" "-aux-triple" "x86_64-unknown-linux-gnu" +// CHECK: llvm-spirv{{.*}} +// CHECK: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-emit-obj" +// CHECK: clang-linker-wrapper{{.*}} "-o" "a.out" + +// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=spirv64-intel %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-PHASES %s + +// CHECK-PHASES: 0: input, "[[INPUT:.+]]", c, (host-openmp) +// CHECK-PHASES: 1: preprocessor, {0}, cpp-output, (host-openmp) +// CHECK-PHASES: 2: compiler, {1}, ir, (host-openmp) +// CHECK-PHASES: 3: input, "[[INPUT]]", c, (device-openmp) +// CHECK-PHASES: 4: preprocessor, {3}, cpp-output, (device-openmp) +// CHECK-PHASES: 5: compiler, {4}, ir, (device-openmp) +// CHECK-PHASES: 6: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (spirv64-intel)" {5}, ir +// CHECK-PHASES: 7: backend, {6}, assembler, (device-openmp) +// CHECK-PHASES: 8: assembler, {7}, object, (device-openmp) +// CHECK-PHASES: 9: offload, "device-openmp (spirv64-intel)" {8}, object +// CHECK-PHASES: 10: clang-offload-packager, {9}, image, (device-openmp) +// CHECK-PHASES: 11: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (x86_64-unknown-linux-gnu)" {10}, ir +// CHECK-PHASES: 12: backend, {11}, assembler, (host-openmp) +// CHECK-PHASES: 13: assembler, {12}, object, (host-openmp) +// CHECK-PHASES: 14: clang-linker-wrapper, {13}, image, (host-openmp) + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp -fopenmp-targets=spirv64-intel 
-nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS + +// CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_BC:.+]]" +// CHECK-BINDINGS: "spirv64-intel" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[DEVICE_TEMP_BC:.+]]" +// CHECK-BINDINGS: "spirv64-intel" - "SPIR-V::Translator", inputs: ["[[DEVICE_TEMP_BC]]"], output: "[[DEVICE_SPV:.+]]" +// CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[DEVICE_SPV]]"], output: "[[DEVICE_IMAGE:.+]]" +// CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_IMAGE]]"], output: "[[HOST_OBJ:.+]]" +// CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out" + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS-TEMPS +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=spirv64-intel %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS-TEMPS +// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_PP:.+]]" +// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_PP]]"], output: "[[HOST_BC:.+]]" +// CHECK-BINDINGS-TEMPS: "spirv64-intel" - "clang", inputs: ["[[INPUT]]"], output: "[[DEVICE_PP:.+]]" +// CHECK-BINDINGS-TEMPS: "spirv64-intel" - "clang", inputs: ["[[DEVICE_PP]]", "[[HOST_BC]]"], output: "[[DEVICE_TEMP_BC:.+]]" +// CHECK-BINDINGS-TEMPS: "spirv64-intel" - "SPIR-V::Translator", inputs: ["[[DEVICE_TEMP_BC]]"], output: "[[DEVICE_ASM:.+]]" +// CHECK-BINDINGS-TEMPS: "spirv64-intel" - "SPIR-V::Translator", inputs: ["[[DEVICE_ASM]]"], output: "[[DEVICE_SPV:.+]]" +// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[DEVICE_SPV]]"], output: 
"[[DEVICE_IMAGE:.+]]" +// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_IMAGE]]"], output: "[[HOST_ASM:.+]]" +// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang::as", inputs: ["[[HOST_ASM]]"], output: "[[HOST_OBJ:.+]]" +// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out" + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR +// CHECK-EMIT-LLVM-IR: "-cc1" "-triple" "spirv64-intel"{{.*}}"-emit-llvm-bc" + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=spirv64-intel \ +// RUN: --sysroot=%S/Inputs/spirv-openmp/ %s 2>&1 | FileCheck --check-prefix=CHECK-GPULIB %s +// CHECK-GPULIB: "-cc1" "-triple" "spirv64-intel"{{.*}}"-mlink-builtin-bitcode" "{{.*}}libomptarget-spirv64.bc" + +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp --offload-arch=spirv64-intel \ +// RUN: --libomptarget-spirv-bc-path=%t/ -nogpulib %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-OFFLOAD-ARCH-ERROR +// CHECK-OFFLOAD-ARCH-ERROR: error: failed to deduce triple for target architecture 'spirv64-intel'; specify the triple using '-fopenmp-targets' and '-Xopenmp-target' instead diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 4201f043944ed..9fba63b195bc1 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -504,14 +504,14 @@ Expected clang(ArrayRef InputFiles, const ArgList &Args) { {"-Xlinker", Args.MakeArgString("--plugin-opt=" + StringRef(Arg->getValue()))}); - if (!Triple.isNVPTX()) + if (!Triple.isNVPTX() && !Triple.isSPIRV()) CmdArgs.push_back("-Wl,--no-undefined"); for (StringRef InputFile : InputFiles) CmdArgs.push_back(InputFile); // If 
this is CPU offloading we copy the input libraries. - if (!Triple.isAMDGPU() && !Triple.isNVPTX()) { + if (!Triple.isAMDGPU() && !Triple.isNVPTX() && !Triple.isSPIRV()) { CmdArgs.push_back("-Wl,-Bsymbolic"); CmdArgs.push_back("-shared"); ArgStringList LinkerArgs; @@ -595,6 +595,7 @@ Expected linkDevice(ArrayRef InputFiles, case Triple::aarch64_be: case Triple::ppc64: case Triple::ppc64le: + case Triple::spirv64: case Triple::systemz: case Triple::loongarch64: return generic::clang(InputFiles, Args); From 4922350407127607a9e78fc6d19f3f6278b1e46b Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 2 Jan 2025 20:18:56 +0000 Subject: [PATCH 026/480] [gn build] Port cd19f3f787b0 --- llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn index 615c11b6b8d62..700c243864633 100644 --- a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn @@ -94,6 +94,7 @@ static_library("Driver") { "ToolChains/PS4CPU.cpp", "ToolChains/RISCVToolchain.cpp", "ToolChains/SPIRV.cpp", + "ToolChains/SPIRVOpenMP.cpp", "ToolChains/Solaris.cpp", "ToolChains/TCE.cpp", "ToolChains/UEFI.cpp", From f03b100e93196ca1ecec20fde3fc48690b3dad7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=AE=87=E9=80=B8?= Date: Fri, 3 Jan 2025 07:08:58 +0900 Subject: [PATCH 027/480] [Cygwin] Fix global variable dll import (#121439) This PR is necessary for cygwin target of Rust. 
References: * https://github.com/rust-lang/llvm-project/commit/86657cc39f8e42ae73be810fb0703ddac0eeef94 * https://github.com/Berrysoft/llvm-project/commit/a807e9f077351d3c6a68f4abe74c94a039759a2e --- llvm/lib/Target/TargetMachine.cpp | 2 +- llvm/test/CodeGen/X86/mingw-refptr.ll | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index c0985f3be91a5..d5365f3c04743 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -204,7 +204,7 @@ bool TargetMachine::shouldAssumeDSOLocal(const GlobalValue *GV) const { // don't assume the variables to be DSO local unless we actually know // that for sure. This only has to be done for variables; for functions // the linker can insert thunks for calling functions from another DLL. - if (TT.isWindowsGNUEnvironment() && GV->isDeclarationForLinker() && + if (TT.isOSCygMing() && GV->isDeclarationForLinker() && isa(GV)) return false; diff --git a/llvm/test/CodeGen/X86/mingw-refptr.ll b/llvm/test/CodeGen/X86/mingw-refptr.ll index 73f1a9880913c..82a90aba38654 100644 --- a/llvm/test/CodeGen/X86/mingw-refptr.ll +++ b/llvm/test/CodeGen/X86/mingw-refptr.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=x86_64-w64-mingw32 | FileCheck %s -check-prefix=CHECK-X64 +; RUN: llc < %s -mtriple=x86_64-pc-cygwin | FileCheck %s -check-prefix=CHECK-X64 ; RUN: llc < %s -mtriple=i686-w64-mingw32 | FileCheck %s -check-prefix=CHECK-X86 ; RUN: llc < %s -mtriple=i686-w64-mingw32-none-elf | FileCheck %s -check-prefix=CHECK-X86-ELF From f6cb56902c6dcafede21eb6662910b6ff661fc0f Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Thu, 2 Jan 2025 23:22:20 +0100 Subject: [PATCH 028/480] [llvm-(min-)tblgen] Avoid redundant source compilation (#114494) All the sources of `llvm-min-tblgen` are also used for `llvm-tblgen`, with identical compilation flags. 
Reuse the object files of `llvm-min-tblgen` for `llvm-tblgen` by applying the usual source structure of an executable: one file per executable, which is named after the executable and contains the (in this case trivial) main function, which just calls tblgen_main in TableGen.cpp. This should also clear up any confusion (including mine) about where each executable's main function is. While this slightly reduces build time, the main motivation is ccache. Using the hard_link option, building the object files for `llvm-tblgen` will result in a hard link to the same object file already used for `llvm-min-tblgen`. To signal to the build system that the file is new, ccache will update the file's time stamp. Unfortunately, time stamps are shared between all hard-linked files, so this will indirectly also update the time stamps of the object files used for `llvm-min-tblgen`. At the next run, Ninja will recognize this time stamp discrepancy with the expected stamp recorded in `.ninja_log` and rebuild those object files for `llvm-min-tblgen`, which again will also update the stamp for `llvm-tblgen`, and so on. This is especially annoying for tablegen because it means Ninja will re-run all tablegenning in every build. I am using the hard_link option because it reduces the cost of having multiple build-trees of the LLVM sources and reduces the wear to the SSD they are stored on.
--- .../{ => Basic}/ARMTargetDefEmitter.cpp | 0 .../utils/TableGen/{ => Basic}/Attributes.cpp | 0 llvm/utils/TableGen/Basic/CMakeLists.txt | 7 ++++++ .../TableGen/{ => Basic}/DirectiveEmitter.cpp | 0 .../TableGen/{ => Basic}/IntrinsicEmitter.cpp | 4 +-- .../{ => Basic}/RISCVTargetDefEmitter.cpp | 0 llvm/utils/TableGen/{ => Basic}/TableGen.cpp | 6 +++-- llvm/utils/TableGen/Basic/TableGen.h | 13 ++++++++++ llvm/utils/TableGen/{ => Basic}/VTEmitter.cpp | 0 llvm/utils/TableGen/CMakeLists.txt | 25 +++++++------------ llvm/utils/TableGen/llvm-min-tblgen.cpp | 18 +++++++++++++ llvm/utils/TableGen/llvm-tblgen.cpp | 18 +++++++++++++ 12 files changed, 71 insertions(+), 20 deletions(-) rename llvm/utils/TableGen/{ => Basic}/ARMTargetDefEmitter.cpp (100%) rename llvm/utils/TableGen/{ => Basic}/Attributes.cpp (100%) rename llvm/utils/TableGen/{ => Basic}/DirectiveEmitter.cpp (100%) rename llvm/utils/TableGen/{ => Basic}/IntrinsicEmitter.cpp (99%) rename llvm/utils/TableGen/{ => Basic}/RISCVTargetDefEmitter.cpp (100%) rename llvm/utils/TableGen/{ => Basic}/TableGen.cpp (94%) create mode 100644 llvm/utils/TableGen/Basic/TableGen.h rename llvm/utils/TableGen/{ => Basic}/VTEmitter.cpp (100%) create mode 100644 llvm/utils/TableGen/llvm-min-tblgen.cpp create mode 100644 llvm/utils/TableGen/llvm-tblgen.cpp diff --git a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/ARMTargetDefEmitter.cpp rename to llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp diff --git a/llvm/utils/TableGen/Attributes.cpp b/llvm/utils/TableGen/Basic/Attributes.cpp similarity index 100% rename from llvm/utils/TableGen/Attributes.cpp rename to llvm/utils/TableGen/Basic/Attributes.cpp diff --git a/llvm/utils/TableGen/Basic/CMakeLists.txt b/llvm/utils/TableGen/Basic/CMakeLists.txt index 41d737e8d418e..b058fba78eb05 100644 --- a/llvm/utils/TableGen/Basic/CMakeLists.txt +++ b/llvm/utils/TableGen/Basic/CMakeLists.txt 
@@ -9,8 +9,15 @@ set(LLVM_LINK_COMPONENTS ) add_llvm_library(LLVMTableGenBasic OBJECT EXCLUDE_FROM_ALL DISABLE_LLVM_LINK_LLVM_DYLIB + ARMTargetDefEmitter.cpp + Attributes.cpp CodeGenIntrinsics.cpp + DirectiveEmitter.cpp + IntrinsicEmitter.cpp + RISCVTargetDefEmitter.cpp SDNodeProperties.cpp + TableGen.cpp + VTEmitter.cpp ) # Users may include its headers as "Basic/*.h" diff --git a/llvm/utils/TableGen/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/DirectiveEmitter.cpp rename to llvm/utils/TableGen/Basic/DirectiveEmitter.cpp diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp similarity index 99% rename from llvm/utils/TableGen/IntrinsicEmitter.cpp rename to llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp index 093602c3da804..fc2b8908a35b8 100644 --- a/llvm/utils/TableGen/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#include "Basic/CodeGenIntrinsics.h" -#include "Basic/SequenceToOffsetTable.h" +#include "CodeGenIntrinsics.h" +#include "SequenceToOffsetTable.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/RISCVTargetDefEmitter.cpp rename to llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/Basic/TableGen.cpp similarity index 94% rename from llvm/utils/TableGen/TableGen.cpp rename to llvm/utils/TableGen/Basic/TableGen.cpp index bea2a2e735dbe..80ac93f2b54fb 100644 --- a/llvm/utils/TableGen/TableGen.cpp +++ b/llvm/utils/TableGen/Basic/TableGen.cpp @@ -6,10 +6,12 @@ // 
//===----------------------------------------------------------------------===// // -// This file contains the main function for LLVM's TableGen. +// This file contains the global defintions (mostly command line parameters) +// shared between llvm-tblgen and llvm-min-tblgen. // //===----------------------------------------------------------------------===// +#include "TableGen.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" @@ -74,7 +76,7 @@ static TableGen::Emitter::Opt X[] = { {"print-sets", printSets, "Print expanded sets for testing DAG exprs"}, }; -int main(int argc, char **argv) { +int tblgen_main(int argc, char **argv) { InitLLVM X(argc, argv); cl::ParseCommandLineOptions(argc, argv); diff --git a/llvm/utils/TableGen/Basic/TableGen.h b/llvm/utils/TableGen/Basic/TableGen.h new file mode 100644 index 0000000000000..630aea62fcf90 --- /dev/null +++ b/llvm/utils/TableGen/Basic/TableGen.h @@ -0,0 +1,13 @@ +//===- TableGen.h ---------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Shared entry point for llvm-tblgen and llvm-min-tblgen. 
+// +//===----------------------------------------------------------------------===// + +int tblgen_main(int argc, char **argv); diff --git a/llvm/utils/TableGen/VTEmitter.cpp b/llvm/utils/TableGen/Basic/VTEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/VTEmitter.cpp rename to llvm/utils/TableGen/Basic/VTEmitter.cpp diff --git a/llvm/utils/TableGen/CMakeLists.txt b/llvm/utils/TableGen/CMakeLists.txt index ba1e4aa01b48d..e4b686803c976 100644 --- a/llvm/utils/TableGen/CMakeLists.txt +++ b/llvm/utils/TableGen/CMakeLists.txt @@ -5,20 +5,17 @@ add_subdirectory(Basic) # code needed by the backends. add_subdirectory(Common) -set(LLVM_LINK_COMPONENTS Support) - # llvm-min-tablegen only contains a subset of backends necessary to # build llvm/include. It must not depend on TableGenCommon, as # TableGenCommon depends on this already to generate things such as # ValueType definitions. +# Sources included in both, llvm-min-tblgen and llvm-tblgen, must be included +# into LLVMTableGenBasic to avoid redundant compilation and problems with build +# caches. +# At least one source file must be included directly to avoid CMake problems. +# E.g. CMake derives which linker to use from the types of sources added. 
add_tablegen(llvm-min-tblgen LLVM_HEADERS - TableGen.cpp - ARMTargetDefEmitter.cpp - Attributes.cpp - DirectiveEmitter.cpp - IntrinsicEmitter.cpp - RISCVTargetDefEmitter.cpp - VTEmitter.cpp + llvm-min-tblgen.cpp $ PARTIAL_SOURCES_INTENDED @@ -32,10 +29,8 @@ set(LLVM_LINK_COMPONENTS add_tablegen(llvm-tblgen LLVM DESTINATION "${LLVM_TOOLS_INSTALL_DIR}" EXPORT LLVM - ARMTargetDefEmitter.cpp AsmMatcherEmitter.cpp AsmWriterEmitter.cpp - Attributes.cpp CallingConvEmitter.cpp CodeEmitterGen.cpp CodeGenMapTable.cpp @@ -48,7 +43,6 @@ add_tablegen(llvm-tblgen LLVM DecoderEmitter.cpp DFAEmitter.cpp DFAPacketizerEmitter.cpp - DirectiveEmitter.cpp DisassemblerEmitter.cpp DXILEmitter.cpp ExegesisEmitter.cpp @@ -57,18 +51,15 @@ add_tablegen(llvm-tblgen LLVM GlobalISelEmitter.cpp InstrDocsEmitter.cpp InstrInfoEmitter.cpp - IntrinsicEmitter.cpp + llvm-tblgen.cpp MacroFusionPredicatorEmitter.cpp OptionParserEmitter.cpp OptionRSTEmitter.cpp PseudoLoweringEmitter.cpp RegisterBankEmitter.cpp RegisterInfoEmitter.cpp - RISCVTargetDefEmitter.cpp SearchableTableEmitter.cpp SubtargetEmitter.cpp - TableGen.cpp - VTEmitter.cpp WebAssemblyDisassemblerEmitter.cpp X86InstrMappingEmitter.cpp X86DisassemblerTables.cpp @@ -79,6 +70,8 @@ add_tablegen(llvm-tblgen LLVM $ $ + PARTIAL_SOURCES_INTENDED + DEPENDS intrinsics_gen # via llvm-min-tablegen ) diff --git a/llvm/utils/TableGen/llvm-min-tblgen.cpp b/llvm/utils/TableGen/llvm-min-tblgen.cpp new file mode 100644 index 0000000000000..79fce5c555f6e --- /dev/null +++ b/llvm/utils/TableGen/llvm-min-tblgen.cpp @@ -0,0 +1,18 @@ +//===- llvm-min-tblgen.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the main function for LLVM's TableGen. +// +//===----------------------------------------------------------------------===// + +#include "Basic/TableGen.h" + +/// Command line parameters are shared between llvm-tblgen and llvm-min-tblgen. +/// The indirection to tblgen_main exists to ensure that the static variables +/// for the llvm::cl:: mechanism are linked into both executables. +int main(int argc, char **argv) { return tblgen_main(argc, argv); } diff --git a/llvm/utils/TableGen/llvm-tblgen.cpp b/llvm/utils/TableGen/llvm-tblgen.cpp new file mode 100644 index 0000000000000..a38382472a992 --- /dev/null +++ b/llvm/utils/TableGen/llvm-tblgen.cpp @@ -0,0 +1,18 @@ +//===- llvm-tblgen.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the main function for LLVM's TableGen. +// +//===----------------------------------------------------------------------===// + +#include "Basic/TableGen.h" + +/// Command line parameters are shared between llvm-tblgen and llvm-min-tblgen. +/// The indirection to tblgen_main exists to ensure that the static variables +/// for the llvm::cl:: mechanism are linked into both executables. +int main(int argc, char **argv) { return tblgen_main(argc, argv); } From 06b6161d3fa9d69a07e9046dbdd2e230b257d948 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Thu, 2 Jan 2025 23:27:57 +0100 Subject: [PATCH 029/480] Revert "[llvm-(min-)tblgen] Avoid redundant source compilation (#114494)" This reverts commit f6cb56902c6dcafede21eb6662910b6ff661fc0f. 
Buildbot failures such as https://lab.llvm.org/buildbot/#/builders/89/builds/13541: ``` /usr/bin/ld: utils/TableGen/Basic/CMakeFiles/obj.LLVMTableGenBasic.dir/ARMTargetDefEmitter.cpp.o: undefined reference to symbol '_ZN4llvm23EnableABIBreakingChecksE' /usr/bin/ld: /home/tcwg-buildbot/worker/flang-aarch64-libcxx/build/./lib/libLLVMSupport.so.20.0git: error adding symbols: DSO missing from command line ``` Going to investigate. --- .../{Basic => }/ARMTargetDefEmitter.cpp | 0 .../utils/TableGen/{Basic => }/Attributes.cpp | 0 llvm/utils/TableGen/Basic/CMakeLists.txt | 7 ------ llvm/utils/TableGen/Basic/TableGen.h | 13 ---------- llvm/utils/TableGen/CMakeLists.txt | 25 ++++++++++++------- .../TableGen/{Basic => }/DirectiveEmitter.cpp | 0 .../TableGen/{Basic => }/IntrinsicEmitter.cpp | 4 +-- .../{Basic => }/RISCVTargetDefEmitter.cpp | 0 llvm/utils/TableGen/{Basic => }/TableGen.cpp | 6 ++--- llvm/utils/TableGen/{Basic => }/VTEmitter.cpp | 0 llvm/utils/TableGen/llvm-min-tblgen.cpp | 18 ------------- llvm/utils/TableGen/llvm-tblgen.cpp | 18 ------------- 12 files changed, 20 insertions(+), 71 deletions(-) rename llvm/utils/TableGen/{Basic => }/ARMTargetDefEmitter.cpp (100%) rename llvm/utils/TableGen/{Basic => }/Attributes.cpp (100%) delete mode 100644 llvm/utils/TableGen/Basic/TableGen.h rename llvm/utils/TableGen/{Basic => }/DirectiveEmitter.cpp (100%) rename llvm/utils/TableGen/{Basic => }/IntrinsicEmitter.cpp (99%) rename llvm/utils/TableGen/{Basic => }/RISCVTargetDefEmitter.cpp (100%) rename llvm/utils/TableGen/{Basic => }/TableGen.cpp (94%) rename llvm/utils/TableGen/{Basic => }/VTEmitter.cpp (100%) delete mode 100644 llvm/utils/TableGen/llvm-min-tblgen.cpp delete mode 100644 llvm/utils/TableGen/llvm-tblgen.cpp diff --git a/llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp rename to llvm/utils/TableGen/ARMTargetDefEmitter.cpp diff --git 
a/llvm/utils/TableGen/Basic/Attributes.cpp b/llvm/utils/TableGen/Attributes.cpp similarity index 100% rename from llvm/utils/TableGen/Basic/Attributes.cpp rename to llvm/utils/TableGen/Attributes.cpp diff --git a/llvm/utils/TableGen/Basic/CMakeLists.txt b/llvm/utils/TableGen/Basic/CMakeLists.txt index b058fba78eb05..41d737e8d418e 100644 --- a/llvm/utils/TableGen/Basic/CMakeLists.txt +++ b/llvm/utils/TableGen/Basic/CMakeLists.txt @@ -9,15 +9,8 @@ set(LLVM_LINK_COMPONENTS ) add_llvm_library(LLVMTableGenBasic OBJECT EXCLUDE_FROM_ALL DISABLE_LLVM_LINK_LLVM_DYLIB - ARMTargetDefEmitter.cpp - Attributes.cpp CodeGenIntrinsics.cpp - DirectiveEmitter.cpp - IntrinsicEmitter.cpp - RISCVTargetDefEmitter.cpp SDNodeProperties.cpp - TableGen.cpp - VTEmitter.cpp ) # Users may include its headers as "Basic/*.h" diff --git a/llvm/utils/TableGen/Basic/TableGen.h b/llvm/utils/TableGen/Basic/TableGen.h deleted file mode 100644 index 630aea62fcf90..0000000000000 --- a/llvm/utils/TableGen/Basic/TableGen.h +++ /dev/null @@ -1,13 +0,0 @@ -//===- TableGen.h ---------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Shared entry point for llvm-tblgen and llvm-min-tblgen. -// -//===----------------------------------------------------------------------===// - -int tblgen_main(int argc, char **argv); diff --git a/llvm/utils/TableGen/CMakeLists.txt b/llvm/utils/TableGen/CMakeLists.txt index e4b686803c976..ba1e4aa01b48d 100644 --- a/llvm/utils/TableGen/CMakeLists.txt +++ b/llvm/utils/TableGen/CMakeLists.txt @@ -5,17 +5,20 @@ add_subdirectory(Basic) # code needed by the backends. 
add_subdirectory(Common) +set(LLVM_LINK_COMPONENTS Support) + # llvm-min-tablegen only contains a subset of backends necessary to # build llvm/include. It must not depend on TableGenCommon, as # TableGenCommon depends on this already to generate things such as # ValueType definitions. -# Sources included in both, llvm-min-tblgen and llvm-tblgen, must be included -# into LLVMTableGenBasic to avoid redundant compilation and problems with build -# caches. -# At least one source file must be included directly to avoid CMake problems. -# E.g. CMake derives which linker to use from the types of sources added. add_tablegen(llvm-min-tblgen LLVM_HEADERS - llvm-min-tblgen.cpp + TableGen.cpp + ARMTargetDefEmitter.cpp + Attributes.cpp + DirectiveEmitter.cpp + IntrinsicEmitter.cpp + RISCVTargetDefEmitter.cpp + VTEmitter.cpp $ PARTIAL_SOURCES_INTENDED @@ -29,8 +32,10 @@ set(LLVM_LINK_COMPONENTS add_tablegen(llvm-tblgen LLVM DESTINATION "${LLVM_TOOLS_INSTALL_DIR}" EXPORT LLVM + ARMTargetDefEmitter.cpp AsmMatcherEmitter.cpp AsmWriterEmitter.cpp + Attributes.cpp CallingConvEmitter.cpp CodeEmitterGen.cpp CodeGenMapTable.cpp @@ -43,6 +48,7 @@ add_tablegen(llvm-tblgen LLVM DecoderEmitter.cpp DFAEmitter.cpp DFAPacketizerEmitter.cpp + DirectiveEmitter.cpp DisassemblerEmitter.cpp DXILEmitter.cpp ExegesisEmitter.cpp @@ -51,15 +57,18 @@ add_tablegen(llvm-tblgen LLVM GlobalISelEmitter.cpp InstrDocsEmitter.cpp InstrInfoEmitter.cpp - llvm-tblgen.cpp + IntrinsicEmitter.cpp MacroFusionPredicatorEmitter.cpp OptionParserEmitter.cpp OptionRSTEmitter.cpp PseudoLoweringEmitter.cpp RegisterBankEmitter.cpp RegisterInfoEmitter.cpp + RISCVTargetDefEmitter.cpp SearchableTableEmitter.cpp SubtargetEmitter.cpp + TableGen.cpp + VTEmitter.cpp WebAssemblyDisassemblerEmitter.cpp X86InstrMappingEmitter.cpp X86DisassemblerTables.cpp @@ -70,8 +79,6 @@ add_tablegen(llvm-tblgen LLVM $ $ - PARTIAL_SOURCES_INTENDED - DEPENDS intrinsics_gen # via llvm-min-tablegen ) diff --git 
a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp b/llvm/utils/TableGen/DirectiveEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/Basic/DirectiveEmitter.cpp rename to llvm/utils/TableGen/DirectiveEmitter.cpp diff --git a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp similarity index 99% rename from llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp rename to llvm/utils/TableGen/IntrinsicEmitter.cpp index fc2b8908a35b8..093602c3da804 100644 --- a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#include "CodeGenIntrinsics.h" -#include "SequenceToOffsetTable.h" +#include "Basic/CodeGenIntrinsics.h" +#include "Basic/SequenceToOffsetTable.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" diff --git a/llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp rename to llvm/utils/TableGen/RISCVTargetDefEmitter.cpp diff --git a/llvm/utils/TableGen/Basic/TableGen.cpp b/llvm/utils/TableGen/TableGen.cpp similarity index 94% rename from llvm/utils/TableGen/Basic/TableGen.cpp rename to llvm/utils/TableGen/TableGen.cpp index 80ac93f2b54fb..bea2a2e735dbe 100644 --- a/llvm/utils/TableGen/Basic/TableGen.cpp +++ b/llvm/utils/TableGen/TableGen.cpp @@ -6,12 +6,10 @@ // //===----------------------------------------------------------------------===// // -// This file contains the global defintions (mostly command line parameters) -// shared between llvm-tblgen and llvm-min-tblgen. +// This file contains the main function for LLVM's TableGen. 
// //===----------------------------------------------------------------------===// -#include "TableGen.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" @@ -76,7 +74,7 @@ static TableGen::Emitter::Opt X[] = { {"print-sets", printSets, "Print expanded sets for testing DAG exprs"}, }; -int tblgen_main(int argc, char **argv) { +int main(int argc, char **argv) { InitLLVM X(argc, argv); cl::ParseCommandLineOptions(argc, argv); diff --git a/llvm/utils/TableGen/Basic/VTEmitter.cpp b/llvm/utils/TableGen/VTEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/Basic/VTEmitter.cpp rename to llvm/utils/TableGen/VTEmitter.cpp diff --git a/llvm/utils/TableGen/llvm-min-tblgen.cpp b/llvm/utils/TableGen/llvm-min-tblgen.cpp deleted file mode 100644 index 79fce5c555f6e..0000000000000 --- a/llvm/utils/TableGen/llvm-min-tblgen.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//===- llvm-min-tblgen.cpp ------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the main function for LLVM's TableGen. -// -//===----------------------------------------------------------------------===// - -#include "Basic/TableGen.h" - -/// Command line parameters are shared between llvm-tblgen and llvm-min-tblgen. -/// The indirection to tblgen_main exists to ensure that the static variables -/// for the llvm::cl:: mechanism are linked into both executables. 
-int main(int argc, char **argv) { return tblgen_main(argc, argv); } diff --git a/llvm/utils/TableGen/llvm-tblgen.cpp b/llvm/utils/TableGen/llvm-tblgen.cpp deleted file mode 100644 index a38382472a992..0000000000000 --- a/llvm/utils/TableGen/llvm-tblgen.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//===- llvm-tblgen.cpp ----------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the main function for LLVM's TableGen. -// -//===----------------------------------------------------------------------===// - -#include "Basic/TableGen.h" - -/// Command line parameters are shared between llvm-tblgen and llvm-min-tblgen. -/// The indirection to tblgen_main exists to ensure that the static variables -/// for the llvm::cl:: mechanism are linked into both executables. -int main(int argc, char **argv) { return tblgen_main(argc, argv); } From 4b577830033066cfd1b2acf4fcf39950678b27bd Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Thu, 2 Jan 2025 22:30:39 +0000 Subject: [PATCH 030/480] [compiler-rt][rtsan] fopencookie support. 
(#120864) --- .../lib/rtsan/rtsan_interceptors_posix.cpp | 7 ++++++ .../tests/rtsan_test_interceptors_posix.cpp | 23 +++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 4e51f464b5730..072923ab35ae0 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -297,6 +297,12 @@ INTERCEPTOR(FILE *, fdopen, int fd, const char *mode) { return REAL(fdopen)(fd, mode); } +INTERCEPTOR(FILE *, fopencookie, void *cookie, const char *mode, + cookie_io_functions_t funcs) { + __rtsan_notify_intercepted_call("fopencookie"); + return REAL(fopencookie)(cookie, mode, funcs); +} + #if SANITIZER_INTERCEPT_OPEN_MEMSTREAM INTERCEPTOR(FILE *, open_memstream, char **buf, size_t *size) { __rtsan_notify_intercepted_call("open_memstream"); @@ -972,6 +978,7 @@ void __rtsan::InitializeInterceptors() { INTERCEPT_FUNCTION(fputs); INTERCEPT_FUNCTION(fdopen); INTERCEPT_FUNCTION(freopen); + INTERCEPT_FUNCTION(fopencookie); RTSAN_MAYBE_INTERCEPT_OPEN_MEMSTREAM; RTSAN_MAYBE_INTERCEPT_FMEMOPEN; INTERCEPT_FUNCTION(lseek); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index b052dd859dcdf..c9c4d7fc4e99e 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -353,6 +353,29 @@ TEST_F(RtsanFileTest, FopenDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } +TEST_F(RtsanFileTest, FopenCookieDieWhenRealtime) { + FILE *f = fopen(GetTemporaryFilePath(), "w"); + EXPECT_THAT(f, Ne(nullptr)); + struct fholder { + FILE *fp; + size_t read; + } fh = {f, 0}; + auto CookieRead = [this](void *cookie, char *buf, size_t size) { + fholder *p = reinterpret_cast(cookie); + p->read = fread(static_cast(buf), 1, size, p->fp); + EXPECT_NE(0, p->read); + }; + 
cookie_io_functions_t funcs = {(cookie_read_function_t *)&CookieRead, nullptr, + nullptr, nullptr}; + auto Func = [&fh, &funcs]() { + FILE *f = fopencookie(&fh, "w", funcs); + EXPECT_THAT(f, Ne(nullptr)); + }; + + ExpectRealtimeDeath(Func, "fopencookie"); + ExpectNonRealtimeSurvival(Func); +} + #if SANITIZER_INTERCEPT_OPEN_MEMSTREAM TEST_F(RtsanFileTest, OpenMemstreamDiesWhenRealtime) { char *buffer; From c703b4645c79e889fd6a0f3f64f01f957d981aa4 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Thu, 2 Jan 2025 14:40:15 -0800 Subject: [PATCH 031/480] [mlir][py] Enable loading only specified dialects during creation. (#121421) Gives the option, both as a global list and as an arg, to control which dialects are loaded during context creation. This enables either setting a good base set or skipping loading in individual cases. --- mlir/python/mlir/_mlir_libs/__init__.py | 42 +++++++++++++++++++++++-- mlir/python/mlir/ir.py | 6 +++- mlir/test/python/ir/dialects.py | 36 +++++++++++++++++++++ 3 files changed, 80 insertions(+), 4 deletions(-) diff --git a/mlir/python/mlir/_mlir_libs/__init__.py b/mlir/python/mlir/_mlir_libs/__init__.py index c5cb22c6dccb8..d021dde05dd87 100644 --- a/mlir/python/mlir/_mlir_libs/__init__.py +++ b/mlir/python/mlir/_mlir_libs/__init__.py @@ -58,6 +58,7 @@ def get_include_dirs() -> Sequence[str]: # needs.
_dialect_registry = None +_load_on_create_dialects = None def get_dialect_registry(): @@ -71,6 +72,21 @@ def get_dialect_registry(): return _dialect_registry +def append_load_on_create_dialect(dialect: str): + global _load_on_create_dialects + if _load_on_create_dialects is None: + _load_on_create_dialects = [dialect] + else: + _load_on_create_dialects.append(dialect) + + +def get_load_on_create_dialects(): + global _load_on_create_dialects + if _load_on_create_dialects is None: + _load_on_create_dialects = [] + return _load_on_create_dialects + + def _site_initialize(): import importlib import itertools @@ -132,15 +148,35 @@ def process_initializer_module(module_name): break class Context(ir._BaseContext): - def __init__(self, *args, **kwargs): + def __init__(self, load_on_create_dialects=None, *args, **kwargs): super().__init__(*args, **kwargs) self.append_dialect_registry(get_dialect_registry()) for hook in post_init_hooks: hook(self) if not disable_multithreading: self.enable_multithreading(True) - if not disable_load_all_available_dialects: - self.load_all_available_dialects() + if load_on_create_dialects is not None: + logger.debug( + "Loading all dialects from load_on_create_dialects arg %r", + load_on_create_dialects, + ) + for dialect in load_on_create_dialects: + # This triggers loading the dialect into the context. + _ = self.dialects[dialect] + else: + if disable_load_all_available_dialects: + dialects = get_load_on_create_dialects() + if dialects: + logger.debug( + "Loading all dialects from global load_on_create_dialects %r", + dialects, + ) + for dialect in dialects: + # This triggers loading the dialect into the context. 
+ _ = self.dialects[dialect] + else: + logger.debug("Loading all available dialects") + self.load_all_available_dialects() if init_module: logger.debug( "Registering translations from initializer %r", init_module diff --git a/mlir/python/mlir/ir.py b/mlir/python/mlir/ir.py index 9a6ce462047ad..6f37266d5bf39 100644 --- a/mlir/python/mlir/ir.py +++ b/mlir/python/mlir/ir.py @@ -5,7 +5,11 @@ from ._mlir_libs._mlir.ir import * from ._mlir_libs._mlir.ir import _GlobalDebug from ._mlir_libs._mlir import register_type_caster, register_value_caster -from ._mlir_libs import get_dialect_registry +from ._mlir_libs import ( + get_dialect_registry, + append_load_on_create_dialect, + get_load_on_create_dialects, +) # Convenience decorator for registering user-friendly Attribute builders. diff --git a/mlir/test/python/ir/dialects.py b/mlir/test/python/ir/dialects.py index d59c6a6bc424e..5a2ed684d298b 100644 --- a/mlir/test/python/ir/dialects.py +++ b/mlir/test/python/ir/dialects.py @@ -121,3 +121,39 @@ def testAppendPrefixSearchPath(): sys.path.append(".") _cext.globals.append_dialect_search_prefix("custom_dialect") assert _cext.globals._check_dialect_module_loaded("custom") + + +# CHECK-LABEL: TEST: testDialectLoadOnCreate +@run +def testDialectLoadOnCreate(): + with Context(load_on_create_dialects=[]) as ctx: + ctx.emit_error_diagnostics = True + ctx.allow_unregistered_dialects = True + + def callback(d): + # CHECK: DIAGNOSTIC + # CHECK-SAME: op created with unregistered dialect + print(f"DIAGNOSTIC={d.message}") + return True + + handler = ctx.attach_diagnostic_handler(callback) + loc = Location.unknown(ctx) + try: + op = Operation.create("arith.addi", loc=loc) + ctx.allow_unregistered_dialects = False + op.verify() + except MLIRError as e: + pass + + with Context(load_on_create_dialects=["func"]) as ctx: + loc = Location.unknown(ctx) + fn = Operation.create("func.func", loc=loc) + + # TODO: This may require an update if a site wide policy is set. 
+ # CHECK: Load on create: [] + print(f"Load on create: {get_load_on_create_dialects()}") + append_load_on_create_dialect("func") + # CHECK: Load on create: + # CHECK-SAME: func + print(f"Load on create: {get_load_on_create_dialects()}") + print(get_load_on_create_dialects()) From 976f3a078bbac1889aa9e68e297f73f111a896d6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 2 Jan 2025 15:05:52 -0800 Subject: [PATCH 032/480] [LLVM] Update RISCV maintainers (#121301) Add Philip Reames and myself as maintainers. I think between the two of us we do a lot of the patch reviews. --- llvm/Maintainers.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index 6d0fda148ce87..fca00ca12f401 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -249,7 +249,11 @@ czhengsz@cn.ibm.com (email), [chenzheng1030](https://github.com/chenzheng1030) ( #### RISCV backend Alex Bradbury \ -asb@igalia.com (email), [asb](https://github.com/asb) (GitHub) +asb@igalia.com (email), [asb](https://github.com/asb) (GitHub) \ +Craig Topper \ +craig.topper@sifive.com (email), [topperc](https://github.com/topperc) (GitHub) \ +Philip Reames \ +listmail@philipreames.com (email), [preames](https://github.com/preames) (GitHub) #### Sparc backend From 2291d0aba927b885cf39150e59fde466a2524bb5 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Thu, 2 Jan 2025 16:28:55 -0800 Subject: [PATCH 033/480] [DAGCombiner] Turn `(neg (max x, (neg x)))` into `(min x, (neg x))` (#120666) This pattern was originally spotted in 429.mcf by @topperc. We already have a DAGCombiner pattern to turn `(neg (abs x))` into `(min x, (neg x))`. But in some cases `(neg (max x, (neg x)))` is formed by an expanded `abs` followed by a `neg` that is generated only after the `abs` expansion. This patch adds a separate pattern to match cases like this, as well as its inverse pattern: `(neg (min X, (neg X))) --> (max X, (neg X))`. 
This pattern is applicable to both signed and unsigned min/max. --- llvm/include/llvm/CodeGen/ISDOpcodes.h | 4 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 17 + .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 15 + llvm/test/CodeGen/RISCV/neg-abs.ll | 444 ++++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/fixed-neg-abs.ll | 54 +++ 5 files changed, 534 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-neg-abs.ll diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 69820aed2137b..604dc9419025b 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1495,6 +1495,10 @@ inline bool isBitwiseLogicOp(unsigned Opcode) { return Opcode == ISD::AND || Opcode == ISD::OR || Opcode == ISD::XOR; } +/// Given a \p MinMaxOpc of ISD::(U|S)MIN or ISD::(U|S)MAX, returns +/// ISD::(U|S)MAX and ISD::(U|S)MIN, respectively. +NodeType getInverseMinMaxOpcode(unsigned MinMaxOpc); + /// Get underlying scalar opcode for VECREDUCE opcode. /// For example ISD::AND for ISD::VECREDUCE_AND. NodeType getVecReduceBaseOpcode(unsigned VecReduceOpcode); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6b2501591c81a..9ec3310b5219b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3949,6 +3949,23 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { if (SDValue Result = TLI.expandABS(N1.getNode(), DAG, true)) return Result; + // Similar to the previous rule, but this time targeting an expanded abs. + // (sub 0, (max X, (sub 0, X))) --> (min X, (sub 0, X)) + // as well as + // (sub 0, (min X, (sub 0, X))) --> (max X, (sub 0, X)) + // Note that these two are applicable to both signed and unsigned min/max. 
+ SDValue X; + SDValue S0; + auto NegPat = m_AllOf(m_Neg(m_Deferred(X)), m_Value(S0)); + if (sd_match(N1, m_OneUse(m_AnyOf(m_SMax(m_Value(X), NegPat), + m_UMax(m_Value(X), NegPat), + m_SMin(m_Value(X), NegPat), + m_UMin(m_Value(X), NegPat))))) { + unsigned NewOpc = ISD::getInverseMinMaxOpcode(N1->getOpcode()); + if (hasOperation(NewOpc, VT)) + return DAG.getNode(NewOpc, DL, VT, X, S0); + } + // Fold neg(splat(neg(x)) -> splat(x) if (VT.isVector()) { SDValue N1S = DAG.getSplatValue(N1, true); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 10e8ba93359fb..0dfd0302ae543 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -430,6 +430,21 @@ bool ISD::matchBinaryPredicate( return true; } +ISD::NodeType ISD::getInverseMinMaxOpcode(unsigned MinMaxOpc) { + switch (MinMaxOpc) { + default: + llvm_unreachable("unrecognized opcode"); + case ISD::UMIN: + return ISD::UMAX; + case ISD::UMAX: + return ISD::UMIN; + case ISD::SMIN: + return ISD::SMAX; + case ISD::SMAX: + return ISD::SMIN; + } +} + ISD::NodeType ISD::getVecReduceBaseOpcode(unsigned VecReduceOpcode) { switch (VecReduceOpcode) { default: diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll index 7d6a6d7ed4ce6..fe19a4fa8bbd8 100644 --- a/llvm/test/CodeGen/RISCV/neg-abs.ll +++ b/llvm/test/CodeGen/RISCV/neg-abs.ll @@ -258,3 +258,447 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) { %neg = sub nsw i64 0, %abs ret i64 %neg } + +define i32 @expanded_neg_abs32(i32 %x) { +; RV32I-LABEL: expanded_neg_abs32: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: blt a0, a1, .LBB6_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB6_2: +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_abs32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: min a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: 
expanded_neg_abs32: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a1, a0 +; RV64I-NEXT: negw a0, a0 +; RV64I-NEXT: blt a1, a0, .LBB6_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB6_2: +; RV64I-NEXT: negw a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_abs32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sext.w a1, a0 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: ret + %n = sub i32 0, %x + %t = call i32 @llvm.smax.i32(i32 %n, i32 %x) + %r = sub i32 0, %t + ret i32 %r +} + +define i32 @expanded_neg_abs32_unsigned(i32 %x) { +; RV32I-LABEL: expanded_neg_abs32_unsigned: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: bltu a0, a1, .LBB7_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB7_2: +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_abs32_unsigned: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: minu a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_abs32_unsigned: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a1, a0 +; RV64I-NEXT: negw a0, a0 +; RV64I-NEXT: bltu a1, a0, .LBB7_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB7_2: +; RV64I-NEXT: negw a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_abs32_unsigned: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sext.w a1, a0 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: maxu a0, a0, a1 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: ret + %n = sub i32 0, %x + %t = call i32 @llvm.umax.i32(i32 %n, i32 %x) + %r = sub i32 0, %t + ret i32 %r +} + +define i64 @expanded_neg_abs64(i64 %x) { +; RV32I-LABEL: expanded_neg_abs64: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: sub a2, a3, a2 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: beq a2, a1, .LBB8_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt a4, a1, a2 +; RV32I-NEXT: beqz a4, .LBB8_3 +; RV32I-NEXT: j .LBB8_4 +; RV32I-NEXT: .LBB8_2: +; RV32I-NEXT: 
sltu a4, a0, a3 +; RV32I-NEXT: bnez a4, .LBB8_4 +; RV32I-NEXT: .LBB8_3: +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: .LBB8_4: +; RV32I-NEXT: snez a0, a3 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_abs64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a3, a1 +; RV32ZBB-NEXT: sub a2, a3, a2 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: beq a2, a1, .LBB8_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: slt a4, a1, a2 +; RV32ZBB-NEXT: beqz a4, .LBB8_3 +; RV32ZBB-NEXT: j .LBB8_4 +; RV32ZBB-NEXT: .LBB8_2: +; RV32ZBB-NEXT: sltu a4, a0, a3 +; RV32ZBB-NEXT: bnez a4, .LBB8_4 +; RV32ZBB-NEXT: .LBB8_3: +; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: .LBB8_4: +; RV32ZBB-NEXT: snez a0, a3 +; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: neg a0, a3 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_abs64: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: blt a0, a1, .LBB8_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB8_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_abs64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: min a0, a0, a1 +; RV64ZBB-NEXT: ret + %n = sub i64 0, %x + %t = call i64 @llvm.smax.i64(i64 %n, i64 %x) + %r = sub i64 0, %t + ret i64 %r +} + +define i64 @expanded_neg_abs64_unsigned(i64 %x) { +; RV32I-LABEL: expanded_neg_abs64_unsigned: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: sub a2, a3, a2 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: beq a2, a1, .LBB9_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a4, a1, a2 +; RV32I-NEXT: beqz a4, .LBB9_3 +; RV32I-NEXT: j .LBB9_4 +; RV32I-NEXT: .LBB9_2: +; RV32I-NEXT: sltu a4, a0, a3 +; RV32I-NEXT: bnez a4, .LBB9_4 +; RV32I-NEXT: .LBB9_3: +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: .LBB9_4: +; 
RV32I-NEXT: snez a0, a3 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_abs64_unsigned: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a3, a1 +; RV32ZBB-NEXT: sub a2, a3, a2 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: beq a2, a1, .LBB9_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: sltu a4, a1, a2 +; RV32ZBB-NEXT: beqz a4, .LBB9_3 +; RV32ZBB-NEXT: j .LBB9_4 +; RV32ZBB-NEXT: .LBB9_2: +; RV32ZBB-NEXT: sltu a4, a0, a3 +; RV32ZBB-NEXT: bnez a4, .LBB9_4 +; RV32ZBB-NEXT: .LBB9_3: +; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: .LBB9_4: +; RV32ZBB-NEXT: snez a0, a3 +; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: neg a0, a3 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_abs64_unsigned: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: bltu a0, a1, .LBB9_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB9_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_abs64_unsigned: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: minu a0, a0, a1 +; RV64ZBB-NEXT: ret + %n = sub i64 0, %x + %t = call i64 @llvm.umax.i64(i64 %n, i64 %x) + %r = sub i64 0, %t + ret i64 %r +} + +define i32 @expanded_neg_inv_abs32(i32 %x) { +; RV32I-LABEL: expanded_neg_inv_abs32: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: blt a1, a0, .LBB10_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB10_2: +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_inv_abs32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: max a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_inv_abs32: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a1, a0 +; RV64I-NEXT: negw a0, a0 +; RV64I-NEXT: blt a0, a1, .LBB10_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB10_2: +; RV64I-NEXT: negw a0, 
a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_inv_abs32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sext.w a1, a0 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: min a0, a0, a1 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: ret + %n = sub i32 0, %x + %t = call i32 @llvm.smin.i32(i32 %n, i32 %x) + %r = sub i32 0, %t + ret i32 %r +} + +define i32 @expanded_neg_inv_abs32_unsigned(i32 %x) { +; RV32I-LABEL: expanded_neg_inv_abs32_unsigned: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: bltu a1, a0, .LBB11_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB11_2: +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_inv_abs32_unsigned: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: maxu a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_inv_abs32_unsigned: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a1, a0 +; RV64I-NEXT: negw a0, a0 +; RV64I-NEXT: bltu a0, a1, .LBB11_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB11_2: +; RV64I-NEXT: negw a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_inv_abs32_unsigned: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sext.w a1, a0 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: minu a0, a0, a1 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: ret + %n = sub i32 0, %x + %t = call i32 @llvm.umin.i32(i32 %n, i32 %x) + %r = sub i32 0, %t + ret i32 %r +} + +define i64 @expanded_neg_inv_abs64(i64 %x) { +; RV32I-LABEL: expanded_neg_inv_abs64: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: sub a2, a3, a2 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: beq a2, a1, .LBB12_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt a4, a2, a1 +; RV32I-NEXT: beqz a4, .LBB12_3 +; RV32I-NEXT: j .LBB12_4 +; RV32I-NEXT: .LBB12_2: +; RV32I-NEXT: sltu a4, a3, a0 +; RV32I-NEXT: bnez a4, .LBB12_4 +; RV32I-NEXT: .LBB12_3: +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: .LBB12_4: +; RV32I-NEXT: snez a0, a3 +; 
RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_inv_abs64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a3, a1 +; RV32ZBB-NEXT: sub a2, a3, a2 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: beq a2, a1, .LBB12_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: slt a4, a2, a1 +; RV32ZBB-NEXT: beqz a4, .LBB12_3 +; RV32ZBB-NEXT: j .LBB12_4 +; RV32ZBB-NEXT: .LBB12_2: +; RV32ZBB-NEXT: sltu a4, a3, a0 +; RV32ZBB-NEXT: bnez a4, .LBB12_4 +; RV32ZBB-NEXT: .LBB12_3: +; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: .LBB12_4: +; RV32ZBB-NEXT: snez a0, a3 +; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: neg a0, a3 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_inv_abs64: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: blt a1, a0, .LBB12_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB12_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_inv_abs64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: ret + %n = sub i64 0, %x + %t = call i64 @llvm.smin.i64(i64 %n, i64 %x) + %r = sub i64 0, %t + ret i64 %r +} + +define i64 @expanded_neg_inv_abs64_unsigned(i64 %x) { +; RV32I-LABEL: expanded_neg_inv_abs64_unsigned: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: sub a2, a3, a2 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: beq a2, a1, .LBB13_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a4, a2, a1 +; RV32I-NEXT: beqz a4, .LBB13_3 +; RV32I-NEXT: j .LBB13_4 +; RV32I-NEXT: .LBB13_2: +; RV32I-NEXT: sltu a4, a3, a0 +; RV32I-NEXT: bnez a4, .LBB13_4 +; RV32I-NEXT: .LBB13_3: +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: .LBB13_4: +; RV32I-NEXT: snez a0, a3 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: 
expanded_neg_inv_abs64_unsigned: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a3, a1 +; RV32ZBB-NEXT: sub a2, a3, a2 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: beq a2, a1, .LBB13_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: sltu a4, a2, a1 +; RV32ZBB-NEXT: beqz a4, .LBB13_3 +; RV32ZBB-NEXT: j .LBB13_4 +; RV32ZBB-NEXT: .LBB13_2: +; RV32ZBB-NEXT: sltu a4, a3, a0 +; RV32ZBB-NEXT: bnez a4, .LBB13_4 +; RV32ZBB-NEXT: .LBB13_3: +; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: .LBB13_4: +; RV32ZBB-NEXT: snez a0, a3 +; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: neg a0, a3 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_inv_abs64_unsigned: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: bltu a1, a0, .LBB13_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB13_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_inv_abs64_unsigned: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: maxu a0, a0, a1 +; RV64ZBB-NEXT: ret + %n = sub i64 0, %x + %t = call i64 @llvm.umin.i64(i64 %n, i64 %x) + %r = sub i64 0, %t + ret i64 %r +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-neg-abs.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-neg-abs.ll new file mode 100644 index 0000000000000..6f1efb6885dee --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-neg-abs.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s + +define <2 x i64> @expanded_fixed_neg_abs64(<2 x i64> %x) { +; CHECK-LABEL: expanded_fixed_neg_abs64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vmin.vv v8, v8, v9 +; CHECK-NEXT: ret + %t = sub <2 x i64> , %x + %t1 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %t, <2 x i64> %x) + %t2 = sub <2 x i64> , %t1 + ret 
<2 x i64> %t2 +} + +define <2 x i64> @expanded_fixed_neg_abs64_unsigned(<2 x i64> %x) { +; CHECK-LABEL: expanded_fixed_neg_abs64_unsigned: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vminu.vv v8, v8, v9 +; CHECK-NEXT: ret + %t = sub <2 x i64> , %x + %t1 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %t, <2 x i64> %x) + %t2 = sub <2 x i64> , %t1 + ret <2 x i64> %t2 +} + +define <2 x i64> @expanded_fixed_neg_inv_abs64(<2 x i64> %x) { +; CHECK-LABEL: expanded_fixed_neg_inv_abs64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v9 +; CHECK-NEXT: ret + %t = sub <2 x i64> , %x + %t1 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %t, <2 x i64> %x) + %t2 = sub <2 x i64> , %t1 + ret <2 x i64> %t2 +} + +define <2 x i64> @expanded_fixed_neg_inv_abs64_unsigned(<2 x i64> %x) { +; CHECK-LABEL: expanded_fixed_neg_inv_abs64_unsigned: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vmaxu.vv v8, v8, v9 +; CHECK-NEXT: ret + %t = sub <2 x i64> , %x + %t1 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %t, <2 x i64> %x) + %t2 = sub <2 x i64> , %t1 + ret <2 x i64> %t2 +} From 3cac26f5419b68d37e1919001e1c46a765df294f Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Thu, 2 Jan 2025 16:29:34 -0800 Subject: [PATCH 034/480] [GISel] Combine `(neg (min/max x, (neg x)))` into `(max/min x, (neg x))` (#120998) This is the GISel version of #120666. Also supports both unsigned and signed version of min & max. 
--- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 + llvm/include/llvm/CodeGen/GlobalISel/Utils.h | 4 + .../include/llvm/Target/GlobalISel/Combine.td | 8 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 28 ++ llvm/lib/CodeGen/GlobalISel/Utils.cpp | 15 + llvm/lib/Target/RISCV/RISCVCombine.td | 2 +- .../RISCV/GlobalISel/combine-neg-abs.ll | 457 ++++++++++++++++++ 7 files changed, 515 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 871456d2a55b5..94e36e412b0cf 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -864,6 +864,9 @@ class CombinerHelper { /// Combine select to integer min/max. bool matchSelectIMinMax(const MachineOperand &MO, BuildFnTy &MatchInfo) const; + /// Transform (neg (min/max x, (neg x))) into (max/min x, (neg x)). + bool matchSimplifyNegMinMax(MachineInstr &MI, BuildFnTy &MatchInfo) const; + /// Combine selects. bool matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo) const; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index cb5a4c14b364c..a35ecae5d18bf 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -171,6 +171,10 @@ void reportGISelWarning(MachineFunction &MF, const TargetPassConfig &TPC, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R); +/// Returns the inverse opcode of \p MinMaxOpc, which is a generic min/max +/// opcode like G_SMIN. +unsigned getInverseGMinMaxOpcode(unsigned MinMaxOpc); + /// If \p VReg is defined by a G_CONSTANT, return the corresponding value.
std::optional getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 013c3a6ed83d8..8641eabbdd84c 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1372,6 +1372,12 @@ def select_to_iminmax: GICombineRule< [{ return Helper.matchSelectIMinMax(${root}, ${info}); }]), (apply [{ Helper.applyBuildFnMO(${root}, ${info}); }])>; +def simplify_neg_minmax : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_SUB):$root, + [{ return Helper.matchSimplifyNegMinMax(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; + def match_selects : GICombineRule< (defs root:$root, build_fn_matchinfo:$matchinfo), (match (wip_match_opcode G_SELECT):$root, @@ -2008,7 +2014,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines, and_or_disjoint_mask, fma_combines, fold_binop_into_select, sub_add_reg, select_to_minmax, fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, - combine_concat_vector, + simplify_neg_minmax, combine_concat_vector, sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines, combine_use_vector_truncate, merge_combines, overflow_combines]>; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index c061c01d3c1b1..4e3aaf5da7198 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -7062,6 +7062,34 @@ bool CombinerHelper::matchSelectIMinMax(const MachineOperand &MO, } } +// (neg (min/max x, (neg x))) --> (max/min x, (neg x)) +bool CombinerHelper::matchSimplifyNegMinMax(MachineInstr &MI, + BuildFnTy &MatchInfo) const { + assert(MI.getOpcode() == TargetOpcode::G_SUB); + Register DestReg = MI.getOperand(0).getReg(); + LLT DestTy = 
MRI.getType(DestReg); + + Register X; + Register Sub0; + auto NegPattern = m_all_of(m_Neg(m_DeferredReg(X)), m_Reg(Sub0)); + if (mi_match(DestReg, MRI, + m_Neg(m_OneUse(m_any_of(m_GSMin(m_Reg(X), NegPattern), + m_GSMax(m_Reg(X), NegPattern), + m_GUMin(m_Reg(X), NegPattern), + m_GUMax(m_Reg(X), NegPattern)))))) { + MachineInstr *MinMaxMI = MRI.getVRegDef(MI.getOperand(2).getReg()); + unsigned NewOpc = getInverseGMinMaxOpcode(MinMaxMI->getOpcode()); + if (isLegal({NewOpc, {DestTy}})) { + MatchInfo = [=](MachineIRBuilder &B) { + B.buildInstr(NewOpc, {DestReg}, {X, Sub0}); + }; + return true; + } + } + + return false; +} + bool CombinerHelper::matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo) const { GSelect *Select = cast(&MI); diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 79382933a1f42..625d556e3ff5e 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -276,6 +276,21 @@ void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC, reportGISelFailure(MF, TPC, MORE, R); } +unsigned llvm::getInverseGMinMaxOpcode(unsigned MinMaxOpc) { + switch (MinMaxOpc) { + case TargetOpcode::G_SMIN: + return TargetOpcode::G_SMAX; + case TargetOpcode::G_SMAX: + return TargetOpcode::G_SMIN; + case TargetOpcode::G_UMIN: + return TargetOpcode::G_UMAX; + case TargetOpcode::G_UMAX: + return TargetOpcode::G_UMIN; + default: + llvm_unreachable("unrecognized opcode"); + } +} + std::optional llvm::getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI) { std::optional ValAndVReg = getIConstantVRegValWithLookThrough( diff --git a/llvm/lib/Target/RISCV/RISCVCombine.td b/llvm/lib/Target/RISCV/RISCVCombine.td index 030613a7d8904..995dd0c5d82eb 100644 --- a/llvm/lib/Target/RISCV/RISCVCombine.td +++ b/llvm/lib/Target/RISCV/RISCVCombine.td @@ -25,5 +25,5 @@ def RISCVPostLegalizerCombiner : GICombiner<"RISCVPostLegalizerCombinerImpl", [sub_to_add, combines_for_extload, 
redundant_and, identity_combines, shift_immed_chain, - commute_constant_to_rhs]> { + commute_constant_to_rhs, simplify_neg_minmax]> { } diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll b/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll new file mode 100644 index 0000000000000..6c848ecf0fffd --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll @@ -0,0 +1,457 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -global-isel -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV32I +; RUN: llc -mtriple=riscv32 -global-isel -mattr=+zbb -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV32ZBB +; RUN: llc -mtriple=riscv64 -global-isel -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV64I +; RUN: llc -mtriple=riscv64 -global-isel -mattr=+zbb -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV64ZBB + +define i32 @expanded_neg_abs32(i32 %x) { +; RV32I-LABEL: expanded_neg_abs32: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: blt a0, a1, .LBB0_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB0_2: +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_abs32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: min a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_abs32: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: sext.w a2, a1 +; RV64I-NEXT: sext.w a3, a0 +; RV64I-NEXT: blt a3, a2, .LBB0_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB0_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_abs32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: negw a1, a0 +; RV64ZBB-NEXT: sext.w a0, a0 +; RV64ZBB-NEXT: max a0, a1, a0 +; RV64ZBB-NEXT: neg a0, a0 +; RV64ZBB-NEXT: ret + %n = sub i32 0, %x + %t = call i32 @llvm.smax.i32(i32 %n, i32 %x) + %r = 
sub i32 0, %t + ret i32 %r +} + +define i32 @expanded_neg_abs32_unsigned(i32 %x) { +; RV32I-LABEL: expanded_neg_abs32_unsigned: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: bltu a0, a1, .LBB1_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB1_2: +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_abs32_unsigned: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: minu a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_abs32_unsigned: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: sext.w a2, a1 +; RV64I-NEXT: sext.w a3, a0 +; RV64I-NEXT: bltu a3, a2, .LBB1_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB1_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_abs32_unsigned: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: negw a1, a0 +; RV64ZBB-NEXT: sext.w a0, a0 +; RV64ZBB-NEXT: maxu a0, a1, a0 +; RV64ZBB-NEXT: neg a0, a0 +; RV64ZBB-NEXT: ret + %n = sub i32 0, %x + %t = call i32 @llvm.umax.i32(i32 %n, i32 %x) + %r = sub i32 0, %t + ret i32 %r +} + +define i64 @expanded_neg_abs64(i64 %x) { +; RV32I-LABEL: expanded_neg_abs64: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: sub a2, a3, a2 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: beq a2, a1, .LBB2_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt a4, a1, a2 +; RV32I-NEXT: beqz a4, .LBB2_3 +; RV32I-NEXT: j .LBB2_4 +; RV32I-NEXT: .LBB2_2: +; RV32I-NEXT: sltu a4, a0, a3 +; RV32I-NEXT: bnez a4, .LBB2_4 +; RV32I-NEXT: .LBB2_3: +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: .LBB2_4: +; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: snez a1, a3 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_abs64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a3, a1 +; RV32ZBB-NEXT: sub a2, a3, a2 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: beq a2, a1, .LBB2_2 +; 
RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: slt a4, a1, a2 +; RV32ZBB-NEXT: beqz a4, .LBB2_3 +; RV32ZBB-NEXT: j .LBB2_4 +; RV32ZBB-NEXT: .LBB2_2: +; RV32ZBB-NEXT: sltu a4, a0, a3 +; RV32ZBB-NEXT: bnez a4, .LBB2_4 +; RV32ZBB-NEXT: .LBB2_3: +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: .LBB2_4: +; RV32ZBB-NEXT: neg a0, a3 +; RV32ZBB-NEXT: snez a1, a3 +; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: sub a1, a2, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_abs64: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: blt a0, a1, .LBB2_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB2_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_abs64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: min a0, a0, a1 +; RV64ZBB-NEXT: ret + %n = sub i64 0, %x + %t = call i64 @llvm.smax.i64(i64 %n, i64 %x) + %r = sub i64 0, %t + ret i64 %r +} + +define i64 @expanded_neg_abs64_unsigned(i64 %x) { +; RV32I-LABEL: expanded_neg_abs64_unsigned: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: sub a2, a3, a2 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: beq a2, a1, .LBB3_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a4, a1, a2 +; RV32I-NEXT: beqz a4, .LBB3_3 +; RV32I-NEXT: j .LBB3_4 +; RV32I-NEXT: .LBB3_2: +; RV32I-NEXT: sltu a4, a0, a3 +; RV32I-NEXT: bnez a4, .LBB3_4 +; RV32I-NEXT: .LBB3_3: +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: .LBB3_4: +; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: snez a1, a3 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_abs64_unsigned: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a3, a1 +; RV32ZBB-NEXT: sub a2, a3, a2 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: beq a2, a1, .LBB3_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: sltu a4, a1, a2 +; RV32ZBB-NEXT: beqz a4, .LBB3_3 +; RV32ZBB-NEXT: j .LBB3_4 +; RV32ZBB-NEXT: .LBB3_2: +; 
RV32ZBB-NEXT: sltu a4, a0, a3 +; RV32ZBB-NEXT: bnez a4, .LBB3_4 +; RV32ZBB-NEXT: .LBB3_3: +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: .LBB3_4: +; RV32ZBB-NEXT: neg a0, a3 +; RV32ZBB-NEXT: snez a1, a3 +; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: sub a1, a2, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_abs64_unsigned: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: bltu a0, a1, .LBB3_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB3_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_abs64_unsigned: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: minu a0, a0, a1 +; RV64ZBB-NEXT: ret + %n = sub i64 0, %x + %t = call i64 @llvm.umax.i64(i64 %n, i64 %x) + %r = sub i64 0, %t + ret i64 %r +} + +define i32 @expanded_neg_inv_abs32(i32 %x) { +; RV32I-LABEL: expanded_neg_inv_abs32: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: blt a1, a0, .LBB4_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB4_2: +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_inv_abs32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: max a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_inv_abs32: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: sext.w a2, a1 +; RV64I-NEXT: sext.w a3, a0 +; RV64I-NEXT: blt a2, a3, .LBB4_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB4_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_inv_abs32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: negw a1, a0 +; RV64ZBB-NEXT: sext.w a0, a0 +; RV64ZBB-NEXT: min a0, a1, a0 +; RV64ZBB-NEXT: neg a0, a0 +; RV64ZBB-NEXT: ret + %n = sub i32 0, %x + %t = call i32 @llvm.smin.i32(i32 %n, i32 %x) + %r = sub i32 0, %t + ret i32 %r +} + +define i32 @expanded_neg_inv_abs32_unsigned(i32 %x) { +; RV32I-LABEL: expanded_neg_inv_abs32_unsigned: +; RV32I: # %bb.0: +; RV32I-NEXT: 
neg a1, a0 +; RV32I-NEXT: bltu a1, a0, .LBB5_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB5_2: +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_inv_abs32_unsigned: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: maxu a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_inv_abs32_unsigned: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: sext.w a2, a1 +; RV64I-NEXT: sext.w a3, a0 +; RV64I-NEXT: bltu a2, a3, .LBB5_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB5_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_inv_abs32_unsigned: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: negw a1, a0 +; RV64ZBB-NEXT: sext.w a0, a0 +; RV64ZBB-NEXT: minu a0, a1, a0 +; RV64ZBB-NEXT: neg a0, a0 +; RV64ZBB-NEXT: ret + %n = sub i32 0, %x + %t = call i32 @llvm.umin.i32(i32 %n, i32 %x) + %r = sub i32 0, %t + ret i32 %r +} + +define i64 @expanded_neg_inv_abs64(i64 %x) { +; RV32I-LABEL: expanded_neg_inv_abs64: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: sub a2, a3, a2 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: beq a2, a1, .LBB6_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt a4, a2, a1 +; RV32I-NEXT: beqz a4, .LBB6_3 +; RV32I-NEXT: j .LBB6_4 +; RV32I-NEXT: .LBB6_2: +; RV32I-NEXT: sltu a4, a3, a0 +; RV32I-NEXT: bnez a4, .LBB6_4 +; RV32I-NEXT: .LBB6_3: +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: .LBB6_4: +; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: snez a1, a3 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_inv_abs64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a3, a1 +; RV32ZBB-NEXT: sub a2, a3, a2 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: beq a2, a1, .LBB6_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: slt a4, a2, a1 +; RV32ZBB-NEXT: beqz a4, .LBB6_3 +; RV32ZBB-NEXT: j .LBB6_4 +; RV32ZBB-NEXT: .LBB6_2: +; 
RV32ZBB-NEXT: sltu a4, a3, a0 +; RV32ZBB-NEXT: bnez a4, .LBB6_4 +; RV32ZBB-NEXT: .LBB6_3: +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: .LBB6_4: +; RV32ZBB-NEXT: neg a0, a3 +; RV32ZBB-NEXT: snez a1, a3 +; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: sub a1, a2, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_inv_abs64: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: blt a1, a0, .LBB6_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB6_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_inv_abs64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: ret + %n = sub i64 0, %x + %t = call i64 @llvm.smin.i64(i64 %n, i64 %x) + %r = sub i64 0, %t + ret i64 %r +} + +define i64 @expanded_neg_inv_abs64_unsigned(i64 %x) { +; RV32I-LABEL: expanded_neg_inv_abs64_unsigned: +; RV32I: # %bb.0: +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: sub a2, a3, a2 +; RV32I-NEXT: neg a3, a0 +; RV32I-NEXT: beq a2, a1, .LBB7_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a4, a2, a1 +; RV32I-NEXT: beqz a4, .LBB7_3 +; RV32I-NEXT: j .LBB7_4 +; RV32I-NEXT: .LBB7_2: +; RV32I-NEXT: sltu a4, a3, a0 +; RV32I-NEXT: bnez a4, .LBB7_4 +; RV32I-NEXT: .LBB7_3: +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: .LBB7_4: +; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: snez a1, a3 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: expanded_neg_inv_abs64_unsigned: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a3, a1 +; RV32ZBB-NEXT: sub a2, a3, a2 +; RV32ZBB-NEXT: neg a3, a0 +; RV32ZBB-NEXT: beq a2, a1, .LBB7_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: sltu a4, a2, a1 +; RV32ZBB-NEXT: beqz a4, .LBB7_3 +; RV32ZBB-NEXT: j .LBB7_4 +; RV32ZBB-NEXT: .LBB7_2: +; RV32ZBB-NEXT: sltu a4, a3, a0 +; RV32ZBB-NEXT: bnez a4, .LBB7_4 +; RV32ZBB-NEXT: .LBB7_3: +; RV32ZBB-NEXT: mv a3, a0 +; 
RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: .LBB7_4: +; RV32ZBB-NEXT: neg a0, a3 +; RV32ZBB-NEXT: snez a1, a3 +; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: sub a1, a2, a1 +; RV32ZBB-NEXT: ret +; +; RV64I-LABEL: expanded_neg_inv_abs64_unsigned: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: bltu a1, a0, .LBB7_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB7_2: +; RV64I-NEXT: neg a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: expanded_neg_inv_abs64_unsigned: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: maxu a0, a0, a1 +; RV64ZBB-NEXT: ret + %n = sub i64 0, %x + %t = call i64 @llvm.umin.i64(i64 %n, i64 %x) + %r = sub i64 0, %t + ret i64 %r +} From 4010e0c45b87e4d073c407cae787e96d4808ad36 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 2 Jan 2025 16:57:31 -0800 Subject: [PATCH 035/480] [libc] Use __attribute__((__noreturn__)) for _Noreturn in C < 11 (#121252) When in modes like C99, the _Noreturn keyword is not available in C. But GNU-compatible compilers have a `noreturn` attribute with the same effect on function declarations. --- libc/include/__llvm-libc-common.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/libc/include/__llvm-libc-common.h b/libc/include/__llvm-libc-common.h index d54ee7b9f91f3..d9d70aff771c0 100644 --- a/libc/include/__llvm-libc-common.h +++ b/libc/include/__llvm-libc-common.h @@ -52,6 +52,16 @@ #undef __restrict #define __restrict restrict // C99 and above support the restrict keyword. +#undef _Noreturn +#if __STDC_VERSION__ >= 201112L +// In C11 and later, _Noreturn is a keyword. +#elif defined(__GNUC__) +// GNU-compatible compilers have an equivalent attribute. 
+#define _Noreturn __attribute__((__noreturn__)) +#else +#define _Noreturn +#endif + #undef __NOEXCEPT #ifdef __GNUC__ #define __NOEXCEPT __attribute__((__nothrow__)) From 4b17a8b10ebb69d3bd30ee7714b5ca24f7e944dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 2 Jan 2025 17:02:45 -0800 Subject: [PATCH 036/480] [flang][cuda] Add operation to sync global descriptor (#121520) Introduce cuf.sync_descriptor to be used to sync device global descriptor after pointer association. Also move CUFCommon so it can be used in FIRBuilder lib as well. --- .../{Transforms => Builder}/CUFCommon.h | 0 .../flang/Optimizer/Dialect/CUF/CUFOps.td | 11 +++++++++++ flang/lib/Lower/Allocatable.cpp | 19 +++++++++++++++++++ flang/lib/Optimizer/Builder/CMakeLists.txt | 1 + .../{Transforms => Builder}/CUFCommon.cpp | 2 +- flang/lib/Optimizer/Transforms/CMakeLists.txt | 1 - .../Transforms/CUFAddConstructor.cpp | 2 +- .../Optimizer/Transforms/CUFDeviceGlobal.cpp | 2 +- .../Optimizer/Transforms/CUFOpConversion.cpp | 2 +- .../Transforms/SimplifyIntrinsics.cpp | 2 +- flang/test/Lower/CUDA/cuda-pointer-sync.cuf | 17 +++++++++++++++++ 11 files changed, 53 insertions(+), 6 deletions(-) rename flang/include/flang/Optimizer/{Transforms => Builder}/CUFCommon.h (100%) rename flang/lib/Optimizer/{Transforms => Builder}/CUFCommon.cpp (97%) create mode 100644 flang/test/Lower/CUDA/cuda-pointer-sync.cuf diff --git a/flang/include/flang/Optimizer/Transforms/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h similarity index 100% rename from flang/include/flang/Optimizer/Transforms/CUFCommon.h rename to flang/include/flang/Optimizer/Builder/CUFCommon.h diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td index d06587c57d44b..9a31ffa2e9471 100644 --- 
a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td +++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td @@ -140,6 +140,17 @@ def cuf_DeallocateOp : cuf_Op<"deallocate", let hasVerifier = 1; } +def cuf_SyncDescriptorOp : cuf_Op<"sync_descriptor", []> { + let summary = + "Synchronize the host and device descriptor of a Fortran pointer"; + + let arguments = (ins SymbolRefAttr:$globalName); + + let assemblyFormat = [{ + $globalName attr-dict + }]; +} + def cuf_DataTransferOp : cuf_Op<"data_transfer", []> { let summary = "Represent a data transfer between host and device memory"; diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp index fb8380ac7e8c5..4c64870675816 100644 --- a/flang/lib/Lower/Allocatable.cpp +++ b/flang/lib/Lower/Allocatable.cpp @@ -22,12 +22,14 @@ #include "flang/Lower/PFTBuilder.h" #include "flang/Lower/Runtime.h" #include "flang/Lower/StatementContext.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/Runtime/RTBuilder.h" #include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIROpsSupport.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Support/FatalError.h" #include "flang/Optimizer/Support/InternalNames.h" #include "flang/Parser/parse-tree.h" @@ -1086,6 +1088,22 @@ bool Fortran::lower::isArraySectionWithoutVectorSubscript( !Fortran::evaluate::HasVectorSubscript(expr); } +static void genCUFPointerSync(const mlir::Value box, + fir::FirOpBuilder &builder) { + if (auto declareOp = box.getDefiningOp()) { + if (auto addrOfOp = declareOp.getMemref().getDefiningOp()) { + auto mod = addrOfOp->getParentOfType(); + if (auto globalOp = + mod.lookupSymbol(addrOfOp.getSymbol())) { + if (cuf::isRegisteredDeviceGlobal(globalOp)) { + builder.create(box.getLoc(), + addrOfOp.getSymbol()); + } + } + } + } +} + void 
Fortran::lower::associateMutableBox( Fortran::lower::AbstractConverter &converter, mlir::Location loc, const fir::MutableBoxValue &box, const Fortran::lower::SomeExpr &source, @@ -1098,6 +1116,7 @@ void Fortran::lower::associateMutableBox( if (converter.getLoweringOptions().getLowerToHighLevelFIR()) { fir::ExtendedValue rhs = converter.genExprAddr(loc, source, stmtCtx); fir::factory::associateMutableBox(builder, loc, box, rhs, lbounds); + genCUFPointerSync(box.getAddr(), builder); return; } // The right hand side is not be evaluated into a temp. Array sections can diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt index 05164d41a4cb5..a824d70fdb5c7 100644 --- a/flang/lib/Optimizer/Builder/CMakeLists.txt +++ b/flang/lib/Optimizer/Builder/CMakeLists.txt @@ -5,6 +5,7 @@ add_flang_library(FIRBuilder BoxValue.cpp Character.cpp Complex.cpp + CUFCommon.cpp DoLoopHelper.cpp FIRBuilder.cpp HLFIRTools.cpp diff --git a/flang/lib/Optimizer/Transforms/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp similarity index 97% rename from flang/lib/Optimizer/Transforms/CUFCommon.cpp rename to flang/lib/Optimizer/Builder/CUFCommon.cpp index bbe33217e8f45..81a8a90ce394e 100644 --- a/flang/lib/Optimizer/Transforms/CUFCommon.cpp +++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "flang/Optimizer/Transforms/CUFCommon.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 9eafa4ec234bd..d20d3bc4108ce 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -9,7 +9,6 @@ add_flang_library(FIRTransforms CompilerGeneratedNames.cpp 
ConstantArgumentGlobalisation.cpp ControlFlowConverter.cpp - CUFCommon.cpp CUFAddConstructor.cpp CUFDeviceGlobal.cpp CUFOpConversion.cpp diff --git a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp index 9591f48c5d417..97551595db039 100644 --- a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp +++ b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/Runtime/RTBuilder.h" #include "flang/Optimizer/Builder/Todo.h" @@ -19,7 +20,6 @@ #include "flang/Optimizer/Dialect/FIROpsSupport.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Support/DataLayout.h" -#include "flang/Optimizer/Transforms/CUFCommon.h" #include "flang/Runtime/CUDA/registration.h" #include "flang/Runtime/entry-names.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" diff --git a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp index 07cc1f3b4b51c..2e6c272fa9089 100644 --- a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp +++ b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp @@ -7,12 +7,12 @@ //===----------------------------------------------------------------------===// #include "flang/Common/Fortran.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Support/InternalNames.h" -#include "flang/Optimizer/Transforms/CUFCommon.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/allocatable.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" diff --git 
a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index de5c51556eecf..fb0ef24654644 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -8,6 +8,7 @@ #include "flang/Optimizer/Transforms/CUFOpConversion.h" #include "flang/Common/Fortran.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/Runtime/RTBuilder.h" #include "flang/Optimizer/CodeGen/TypeConverter.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" @@ -15,7 +16,6 @@ #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Support/DataLayout.h" -#include "flang/Optimizer/Transforms/CUFCommon.h" #include "flang/Runtime/CUDA/allocatable.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/CUDA/descriptor.h" diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp index d3567f453fceb..fa6a7b23624e8 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp @@ -24,6 +24,7 @@ #include "flang/Common/Fortran.h" #include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/LowLevelIntrinsics.h" #include "flang/Optimizer/Builder/Todo.h" @@ -31,7 +32,6 @@ #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Dialect/Support/FIRContext.h" #include "flang/Optimizer/HLFIR/HLFIRDialect.h" -#include "flang/Optimizer/Transforms/CUFCommon.h" #include "flang/Optimizer/Transforms/Passes.h" #include "flang/Optimizer/Transforms/Utils.h" #include "flang/Runtime/entry-names.h" diff --git a/flang/test/Lower/CUDA/cuda-pointer-sync.cuf b/flang/test/Lower/CUDA/cuda-pointer-sync.cuf new file mode 100644 index 0000000000000..e17869b2d6357 --- 
/dev/null +++ b/flang/test/Lower/CUDA/cuda-pointer-sync.cuf @@ -0,0 +1,17 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +module devptr + real, device, pointer, dimension(:) :: dev_ptr +end module + +use devptr +real, device, target, dimension(4) :: a_dev +a_dev = 42.0 +dev_ptr => a_dev +end + +! CHECK: fir.global @_QMdevptrEdev_ptr {data_attr = #cuf.cuda} : !fir.box>> +! CHECK-LABEL: func.func @_QQmain() +! CHECK: fir.embox +! CHECK: fir.store +! CHECK: cuf.sync_descriptor @_QMdevptrEdev_ptr From 6dcd2b035da34fa53693b401139a419adb7342db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 2 Jan 2025 17:02:59 -0800 Subject: [PATCH 037/480] [flang][cuda] Convert cuf.sync_descriptor to runtime call (#121524) Convert the op to a new entry point in the runtime `CUFSyncGlobalDescriptor` --- flang/include/flang/Runtime/CUDA/descriptor.h | 4 ++ .../Optimizer/Transforms/CUFOpConversion.cpp | 42 ++++++++++++++++++- flang/runtime/CUDA/descriptor.cpp | 7 ++++ flang/test/Fir/CUDA/cuda-sync-desc.mlir | 20 +++++++++ 4 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 flang/test/Fir/CUDA/cuda-sync-desc.mlir diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h index 55878aaac57fb..0ee7feca10e44 100644 --- a/flang/include/flang/Runtime/CUDA/descriptor.h +++ b/flang/include/flang/Runtime/CUDA/descriptor.h @@ -33,6 +33,10 @@ void *RTDECL(CUFGetDeviceAddress)( void RTDECL(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src, const char *sourceFile = nullptr, int sourceLine = 0); +/// Get the device address registered with \p hostPtr and sync the descriptors. 
+void RTDECL(CUFSyncGlobalDescriptor)( + void *hostPtr, const char *sourceFile = nullptr, int sourceLine = 0); + } // extern "C" } // namespace Fortran::runtime::cuda diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index fb0ef24654644..f08f9e412b885 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -788,6 +788,45 @@ struct CUFLaunchOpConversion const mlir::SymbolTable &symTab; }; +struct CUFSyncDescriptorOpConversion + : public mlir::OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + CUFSyncDescriptorOpConversion(mlir::MLIRContext *context, + const mlir::SymbolTable &symTab) + : OpRewritePattern(context), symTab{symTab} {} + + mlir::LogicalResult + matchAndRewrite(cuf::SyncDescriptorOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->getParentOfType(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + + auto globalOp = mod.lookupSymbol(op.getGlobalName()); + if (!globalOp) + return mlir::failure(); + + auto hostAddr = builder.create( + loc, fir::ReferenceType::get(globalOp.getType()), op.getGlobalName()); + mlir::func::FuncOp callee = + fir::runtime::getRuntimeFunc(loc, + builder); + auto fTy = callee.getFunctionType(); + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); + llvm::SmallVector args{fir::runtime::createArguments( + builder, loc, fTy, hostAddr, sourceFile, sourceLine)}; + builder.create(loc, callee, args); + op.erase(); + return mlir::success(); + } + +private: + const mlir::SymbolTable &symTab; +}; + class CUFOpConversion : public fir::impl::CUFOpConversionBase { public: void runOnOperation() override { @@ -851,7 +890,8 @@ void cuf::populateCUFToFIRConversionPatterns( CUFFreeOpConversion>(patterns.getContext()); 
patterns.insert(patterns.getContext(), symtab, &dl, &converter); - patterns.insert(patterns.getContext(), symtab); + patterns.insert( + patterns.getContext(), symtab); } void cuf::populateFIRCUFConversionPatterns(const mlir::SymbolTable &symtab, diff --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp index 391c47e84241d..947eeb66aa3d6 100644 --- a/flang/runtime/CUDA/descriptor.cpp +++ b/flang/runtime/CUDA/descriptor.cpp @@ -46,6 +46,13 @@ void RTDEF(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src, (void *)dst, (const void *)src, count, cudaMemcpyHostToDevice)); } +void RTDEF(CUFSyncGlobalDescriptor)( + void *hostPtr, const char *sourceFile, int sourceLine) { + void *devAddr{RTNAME(CUFGetDeviceAddress)(hostPtr, sourceFile, sourceLine)}; + RTNAME(CUFDescriptorSync) + ((Descriptor *)devAddr, (Descriptor *)hostPtr, sourceFile, sourceLine); +} + RT_EXT_API_GROUP_END } } // namespace Fortran::runtime::cuda diff --git a/flang/test/Fir/CUDA/cuda-sync-desc.mlir b/flang/test/Fir/CUDA/cuda-sync-desc.mlir new file mode 100644 index 0000000000000..20b317f34a7f2 --- /dev/null +++ b/flang/test/Fir/CUDA/cuda-sync-desc.mlir @@ -0,0 +1,20 @@ +// RUN: fir-opt --cuf-convert %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f64 = dense<64> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, "dlti.endianness" = "little", "dlti.stack_alignment" = 128 : i64>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = 
"flang version 20.0.0 (git@github.com:clementval/llvm-project.git f37e52237791f58438790c77edeb8de08f692987)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { + fir.global @_QMdevptrEdev_ptr {data_attr = #cuf.cuda} : !fir.box>> { + %0 = fir.zero_bits !fir.ptr> + %c0 = arith.constant 0 : index + %1 = fir.shape %c0 : (index) -> !fir.shape<1> + %2 = fir.embox %0(%1) {allocator_idx = 2 : i32} : (!fir.ptr>, !fir.shape<1>) -> !fir.box>> + fir.has_value %2 : !fir.box>> + } + func.func @_QQmain() { + cuf.sync_descriptor @_QMdevptrEdev_ptr + return + } +} + +// CHECK-LABEL: func.func @_QQmain() +// CHECK: %[[HOST_ADDR:.*]] = fir.address_of(@_QMdevptrEdev_ptr) : !fir.ref>>> +// CHECK: %[[HOST_ADDR_PTR:.*]] = fir.convert %[[HOST_ADDR]] : (!fir.ref>>>) -> !fir.llvm_ptr +// CHECK: fir.call @_FortranACUFSyncGlobalDescriptor(%[[HOST_ADDR_PTR]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) From 532a2691bc015fafdd356c10b17c466fe28c49b1 Mon Sep 17 00:00:00 2001 From: Sudharsan Veeravalli Date: Fri, 3 Jan 2025 06:33:27 +0530 Subject: [PATCH 038/480] [RISCV] Add Qualcomm uC Xqcicli (Conditional Load Immediate) extension (#121292) This extension adds 12 instructions that conditionally load an immediate value. The current spec can be found at: https://github.com/quic/riscv-unified-db/releases/latest This patch adds assembler only support. 
--- .../Driver/print-supported-extensions-riscv.c | 1 + llvm/docs/RISCVUsage.rst | 3 + llvm/docs/ReleaseNotes.md | 2 + .../RISCV/Disassembler/RISCVDisassembler.cpp | 3 + llvm/lib/Target/RISCV/RISCVFeatures.td | 8 + llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 27 ++ llvm/lib/TargetParser/RISCVISAInfo.cpp | 3 +- llvm/test/CodeGen/RISCV/attributes.ll | 2 + llvm/test/MC/RISCV/xqcicli-invalid.s | 232 ++++++++++++++++++ llvm/test/MC/RISCV/xqcicli-valid.s | 59 +++++ .../TargetParser/RISCVISAInfoTest.cpp | 4 +- 11 files changed, 342 insertions(+), 2 deletions(-) create mode 100644 llvm/test/MC/RISCV/xqcicli-invalid.s create mode 100644 llvm/test/MC/RISCV/xqcicli-valid.s diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index 8e46690cce5a6..395501eb85ccc 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -190,6 +190,7 @@ // CHECK-NEXT: svukte 0.3 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses) // CHECK-NEXT: xqcia 0.2 'Xqcia' (Qualcomm uC Arithmetic Extension) // CHECK-NEXT: xqciac 0.2 'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension) +// CHECK-NEXT: xqcicli 0.2 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) // CHECK-NEXT: xqcics 0.2 'Xqcics' (Qualcomm uC Conditional Select Extension) // CHECK-NEXT: xqcicsr 0.2 'Xqcicsr' (Qualcomm uC CSR Extension) // CHECK-NEXT: xqcilsm 0.2 'Xqcilsm' (Qualcomm uC Load Store Multiple Extension) diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 22600f5720553..eaaad6c516818 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -432,6 +432,9 @@ The current vendor extensions supported are: ``experimental-Xqciac`` LLVM implements `version 0.2 of the Qualcomm uC Load-Store Address Calculation extension specification `__ by Qualcomm. 
All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. +``experimental-Xqcicli`` + LLVM implements `version 0.2 of the Qualcomm uC Conditional Load Immediate extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. + ``experimental-Xqcics`` LLVM implements `version 0.2 of the Qualcomm uC Conditional Select extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 99a93b0467602..be62a7e8696b4 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -230,6 +230,8 @@ Changes to the RISC-V Backend extension. * Adds experimental assembler support for the Qualcomm uC 'Xqcilsm` (Load Store Multiple) extension. +* Adds experimental assembler support for the Qualcomm uC 'Xqcicli` (Conditional Load Immediate) + extension. 
Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 57443d3f38e3c..30122831767f6 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -695,6 +695,9 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size, TRY_TO_DECODE_FEATURE( RISCV::FeatureVendorXqciac, DecoderTableXqciac32, "Qualcomm uC Load-Store Address Calculation custom opcode table"); + TRY_TO_DECODE_FEATURE( + RISCV::FeatureVendorXqcicli, DecoderTableXqcicli32, + "Qualcomm uC Conditional Load Immediate custom opcode table"); TRY_TO_DECODE(true, DecoderTable32, "RISCV32 table"); return MCDisassembler::Fail; diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 916b140c5bde7..3885b95a8937a 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1282,6 +1282,14 @@ def HasVendorXqciac AssemblerPredicate<(all_of FeatureVendorXqciac), "'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)">; +def FeatureVendorXqcicli + : RISCVExperimentalExtension<0, 2, + "Qualcomm uC Conditional Load Immediate Extension">; +def HasVendorXqcicli + : Predicate<"Subtarget->hasVendorXqcicli()">, + AssemblerPredicate<(all_of FeatureVendorXqcicli), + "'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)">; + //===----------------------------------------------------------------------===// // LLVM specific features and extensions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index ca73fbccd9d2d..5e6722cb4995e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -139,6 +139,17 @@ class 
QCIStoreMultiple funct2, DAGOperand InTyRs2, string opcodestr> let Inst{31-25} = {funct2, imm{6-2}}; } +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class QCILICC funct3, bits<2> funct2, DAGOperand InTyRs2, string opcodestr> + : RVInstRBase { + let Constraints = "$rd = $rd_wb"; + bits<5> simm; + + let Inst{31-25} = {simm, funct2}; +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -243,6 +254,22 @@ let Predicates = [HasVendorXqcilsm, IsRV32], DecoderNamespace = "Xqcilsm" in { def QC_LWMI : QCILoadMultiple<0b01, uimm5nonzero, "qc.lwmi">; } // Predicates = [HasVendorXqcilsm, IsRV32], DecoderNamespace = "Xqcilsm" +let Predicates = [HasVendorXqcicli, IsRV32], DecoderNamespace = "Xqcicli" in { + def QC_LIEQ : QCILICC<0b000, 0b01, GPRNoX0, "qc.lieq">; + def QC_LINE : QCILICC<0b001, 0b01, GPRNoX0, "qc.line">; + def QC_LILT : QCILICC<0b100, 0b01, GPRNoX0, "qc.lilt">; + def QC_LIGE : QCILICC<0b101, 0b01, GPRNoX0, "qc.lige">; + def QC_LILTU : QCILICC<0b110, 0b01, GPRNoX0, "qc.liltu">; + def QC_LIGEU : QCILICC<0b111, 0b01, GPRNoX0, "qc.ligeu">; + + def QC_LIEQI : QCILICC<0b000, 0b11, simm5, "qc.lieqi">; + def QC_LINEI : QCILICC<0b001, 0b11, simm5, "qc.linei">; + def QC_LILTI : QCILICC<0b100, 0b11, simm5, "qc.lilti">; + def QC_LIGEI : QCILICC<0b101, 0b11, simm5, "qc.ligei">; + def QC_LILTUI : QCILICC<0b110, 0b11, uimm5, "qc.liltui">; + def QC_LIGEUI : QCILICC<0b111, 0b11, uimm5, "qc.ligeui">; +} // Predicates = [HasVendorXqcicli, IsRV32], DecoderNamespace = "Xqcicli" + //===----------------------------------------------------------------------===// // Aliases //===----------------------------------------------------------------------===// diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp index e4e459a77b5f8..4f403e9fb6f57 100644 --- a/llvm/lib/TargetParser/RISCVISAInfo.cpp +++ 
b/llvm/lib/TargetParser/RISCVISAInfo.cpp @@ -742,7 +742,8 @@ Error RISCVISAInfo::checkDependency() { bool HasZvl = MinVLen != 0; bool HasZcmt = Exts.count("zcmt") != 0; static constexpr StringLiteral XqciExts[] = { - {"xqcia"}, {"xqciac"}, {"xqcics"}, {"xqcicsr"}, {"xqcilsm"}, {"xqcisls"}}; + {"xqcia"}, {"xqciac"}, {"xqcicli"}, {"xqcics"}, + {"xqcicsr"}, {"xqcilsm"}, {"xqcisls"}}; if (HasI && HasE) return getIncompatibleError("i", "e"); diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 3f2b2c9470783..bcf945470d85b 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -83,6 +83,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+xwchc %s -o - | FileCheck --check-prefix=RV32XWCHC %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcia %s -o - | FileCheck --check-prefix=RV32XQCIA %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqciac %s -o - | FileCheck --check-prefix=RV32XQCIAC %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicli %s -o - | FileCheck --check-prefix=RV32XQCICLI %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcics %s -o - | FileCheck --check-prefix=RV32XQCICS %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicsr %s -o - | FileCheck --check-prefix=RV32XQCICSR %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcilsm %s -o - | FileCheck --check-prefix=RV32XQCILSM %s @@ -393,6 +394,7 @@ ; RV32XWCHC: .attribute 5, "rv32i2p1_xwchc2p2" ; RV32XQCIA: .attribute 5, "rv32i2p1_xqcia0p2" ; RV32XQCIAC: .attribute 5, "rv32i2p1_zca1p0_xqciac0p2" +; RV32XQCICLI: .attribute 5, "rv32i2p1_xqcicli0p2" ; RV32XQCICS: .attribute 5, "rv32i2p1_xqcics0p2" ; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p2" ; RV32XQCILSM: .attribute 5, "rv32i2p1_xqcilsm0p2" diff --git a/llvm/test/MC/RISCV/xqcicli-invalid.s b/llvm/test/MC/RISCV/xqcicli-invalid.s new file mode 100644 index 0000000000000..7ee92ec4cbc01 --- /dev/null +++ b/llvm/test/MC/RISCV/xqcicli-invalid.s @@ -0,0 +1,232 @@ 
+# Xqcicli - Qualcomm uC Conditional Load Immediate Instructions +# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-xqcicli < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-PLUS %s +# RUN: not llvm-mc -triple riscv32 -mattr=-experimental-xqcicli < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-MINUS %s + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.lieq x0, x4, x6, 10 + +# CHECK: :[[@LINE+1]]:13: error: invalid operand for instruction +qc.lieq x2, x0, x6, 10 + +# CHECK: :[[@LINE+1]]:17: error: invalid operand for instruction +qc.lieq x2, x4, x0, 10 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.lieq x2, x4, x6 + +# CHECK-PLUS: :[[@LINE+1]]:21: error: immediate must be an integer in the range [-16, 15] +qc.lieq x2, x4, x6, 40 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.lieq x2, x4, x6, 10 + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.lige x0, x8, x20, 2 + +# CHECK: :[[@LINE+1]]:13: error: invalid operand for instruction +qc.lige x4, x0, x20, 2 + +# CHECK: :[[@LINE+1]]:17: error: invalid operand for instruction +qc.lige x4, x8, x0, 2 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.lige x4, x8, x20 + +# CHECK-PLUS: :[[@LINE+1]]:22: error: immediate must be an integer in the range [-16, 15] +qc.lige x4, x8, x20, -18 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.lige x4, x8, x20, 2 + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.lilt x0, x9, x10, 3 + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.lilt x19, x0, x10, 3 + +# CHECK: :[[@LINE+1]]:18: error: invalid operand for instruction +qc.lilt x19, x9, x0, 3 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.lilt x19, x9, x10 + +# 
CHECK-PLUS: :[[@LINE+1]]:23: error: immediate must be an integer in the range [-16, 15] +qc.lilt x19, x9, x10, 39 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.lilt x19, x9, x10, 3 + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.line x0, x14, x6, 10 + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.line x18, x0, x6, 10 + +# CHECK: :[[@LINE+1]]:19: error: invalid operand for instruction +qc.line x18, x14, x0, 10 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.line x18, x14, x6 + +# CHECK-PLUS: :[[@LINE+1]]:23: error: immediate must be an integer in the range [-16, 15] +qc.line x18, x14, x6, 100 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.line x18, x14, x6, 10 + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.ligeu x0, x4, x6, 10 + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.ligeu x2, x0, x6, 10 + +# CHECK: :[[@LINE+1]]:18: error: invalid operand for instruction +qc.ligeu x2, x4, x0, 10 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.ligeu x2, x4, x6 + +# CHECK-PLUS: :[[@LINE+1]]:22: error: immediate must be an integer in the range [-16, 15] +qc.ligeu x2, x4, x6, 70 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.ligeu x2, x4, x6, 10 + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.liltu x0, x19, x12, 13 + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.liltu x1, x0, x12, 13 + +# CHECK: :[[@LINE+1]]:19: error: invalid operand for instruction +qc.liltu x1, x19, x0, 13 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.liltu x1, x19, x12 + +# CHECK-PLUS: :[[@LINE+1]]:24: error: 
immediate must be an integer in the range [-16, 15] +qc.liltu x1, x19, x12, 73 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.liltu x1, x19, x12, 13 + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.lieqi x0, x1, 15, 12 + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.lieqi x7, x0, 15, 12 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.lieqi x7, x1, 15 + +# CHECK-PLUS: :[[@LINE+1]]:18: error: immediate must be an integer in the range [-16, 15] +qc.lieqi x7, x1, 25, 12 + +# CHECK-PLUS: :[[@LINE+1]]:22: error: immediate must be an integer in the range [-16, 15] +qc.lieqi x7, x1, 15, -22 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.lieqi x7, x1, 15, 12 + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.ligei x0, x11, -4, 9 + +# CHECK: :[[@LINE+1]]:15: error: invalid operand for instruction +qc.ligei x17, x0, -4, 9 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.ligei x17, x11, -4 + +# CHECK-PLUS: :[[@LINE+1]]:20: error: immediate must be an integer in the range [-16, 15] +qc.ligei x17, x11, -24, 9 + +# CHECK-PLUS: :[[@LINE+1]]:24: error: immediate must be an integer in the range [-16, 15] +qc.ligei x17, x11, -4, 59 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.ligei x17, x11, -4, 9 + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.lilti x0, x11, -14, 2 + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.lilti x9, x0, -14, 2 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.lilti x9, x11, -14 + +# CHECK-PLUS: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15] +qc.lilti x9, x11, 
-84, 2 + +# CHECK-PLUS: :[[@LINE+1]]:24: error: immediate must be an integer in the range [-16, 15] +qc.lilti x9, x11, -14, 52 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.lilti x9, x11, -14, 2 + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.linei x0, x1, 10, 12 + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.linei x5, x0, 10, 12 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.linei x5, x1, 10 + +# CHECK-PLUS: :[[@LINE+1]]:18: error: immediate must be an integer in the range [-16, 15] +qc.linei x5, x1, 130, 12 + +# CHECK-PLUS: :[[@LINE+1]]:22: error: immediate must be an integer in the range [-16, 15] +qc.linei x5, x1, 10, 124 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.linei x5, x1, 10, 12 + + +# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction +qc.ligeui x0, x12, 7, -12 + +# CHECK: :[[@LINE+1]]:15: error: invalid operand for instruction +qc.ligeui x2, x0, 7, -12 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.ligeui x2, x12, 7 + +# CHECK-PLUS: :[[@LINE+1]]:20: error: immediate must be an integer in the range [0, 31] +qc.ligeui x2, x12, -7, -12 + +# CHECK-PLUS: :[[@LINE+1]]:23: error: immediate must be an integer in the range [-16, 15] +qc.ligeui x2, x12, 7, -17 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.ligeui x2, x12, 7, -12 + + +# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction +qc.liltui x0, x25, 31, 12 + +# CHECK: :[[@LINE+1]]:15: error: invalid operand for instruction +qc.liltui x3, x0, 31, 12 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.liltui x3, x25, 31 + +# CHECK-PLUS: :[[@LINE+1]]:20: error: immediate must be an 
integer in the range [0, 31] +qc.liltui x3, x25, 32, 12 + +# CHECK-PLUS: :[[@LINE+1]]:24: error: immediate must be an integer in the range [-16, 15] +qc.liltui x3, x25, 31, 112 + +# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) +qc.liltui x3, x25, 31, 12 diff --git a/llvm/test/MC/RISCV/xqcicli-valid.s b/llvm/test/MC/RISCV/xqcicli-valid.s new file mode 100644 index 0000000000000..404bfdf7bce26 --- /dev/null +++ b/llvm/test/MC/RISCV/xqcicli-valid.s @@ -0,0 +1,59 @@ +# Xqcicli - Qualcomm uC Conditional Load Immediate Extension +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicli -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicli < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqcicli -M no-aliases --no-print-imm-hex -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicli -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicli < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqcicli --no-print-imm-hex -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s + +# CHECK-INST: qc.lieq sp, tp, t1, 10 +# CHECK-ENC: encoding: [0x5b,0x01,0x62,0x52] +qc.lieq x2, x4, x6, 10 + +# CHECK-INST: qc.lieqi t2, ra, 15, 12 +# CHECK-ENC: encoding: [0xdb,0x83,0xf0,0x66] +qc.lieqi x7, x1, 15, 12 + +# CHECK-INST: qc.lige tp, s0, s4, 2 +# CHECK-ENC: encoding: [0x5b,0x52,0x44,0x13] +qc.lige x4, x8, x20, 2 + +# CHECK-INST: qc.ligei a7, a1, -4, 9 +# CHECK-ENC: encoding: [0xdb,0xd8,0xc5,0x4f] +qc.ligei x17, x11, -4, 9 + +# CHECK-INST: qc.ligeu sp, tp, t1, 10 +# CHECK-ENC: encoding: [0x5b,0x71,0x62,0x52] +qc.ligeu x2, x4, x6, 10 + +# CHECK-INST: qc.ligeui sp, a2, 7, -12 +# CHECK-ENC: encoding: [0x5b,0x71,0x76,0xa6] 
+qc.ligeui x2, x12, 7, -12 + +# CHECK-INST: qc.lilt s3, s1, a0, 3 +# CHECK-ENC: encoding: [0xdb,0xc9,0xa4,0x1a] +qc.lilt x19, x9, x10, 3 + +# CHECK-INST: qc.lilti s1, a1, -14, 2 +# CHECK-ENC: encoding: [0xdb,0xc4,0x25,0x17] +qc.lilti x9, x11, -14, 2 + +# CHECK-INST: qc.liltu ra, s3, a2, 13 +# CHECK-ENC: encoding: [0xdb,0xe0,0xc9,0x6a] +qc.liltu x1, x19, x12, 13 + +# CHECK-INST: qc.liltui gp, s9, 31, 12 +# CHECK-ENC: encoding: [0xdb,0xe1,0xfc,0x67] +qc.liltui x3, x25, 31, 12 + +# CHECK-INST: qc.line s2, a4, t1, 10 +# CHECK-ENC: encoding: [0x5b,0x19,0x67,0x52] +qc.line x18, x14, x6, 10 + +# CHECK-INST: qc.linei t0, ra, 10, 12 +# CHECK-ENC: encoding: [0xdb,0x92,0xa0,0x66] +qc.linei x5, x1, 10, 12 diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index 176cf82ac34b1..f631f26cf482e 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -655,7 +655,8 @@ TEST(ParseArchString, RejectsConflictingExtensions) { for (StringRef Input : {"rv64i_xqcisls0p2", "rv64i_xqcia0p2", "rv64i_xqciac0p2", - "rv64i_xqcicsr0p2", "rv64i_xqcilsm0p2", "rv64i_xqcics0p2"}) { + "rv64i_xqcicsr0p2", "rv64i_xqcilsm0p2", "rv64i_xqcics0p2", + "rv64i_xqcicli0p2"}) { EXPECT_THAT( toString(RISCVISAInfo::parseArchString(Input, true).takeError()), ::testing::EndsWith(" is only supported for 'rv32'")); @@ -1114,6 +1115,7 @@ Experimental extensions svukte 0.3 xqcia 0.2 xqciac 0.2 + xqcicli 0.2 xqcics 0.2 xqcicsr 0.2 xqcilsm 0.2 From 8e404509cc130d95f09f255649a87446ca81b187 Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Fri, 3 Jan 2025 09:07:44 +0800 Subject: [PATCH 039/480] [JITLink][RISCV] Add feature relax for addsub tests. NFC (#121204) R_RISCV_{ADD*/SUB*} relocations are kept only when feature relax enabled. So it is better to add relax to the test, so that relocs can be reserved for processing by the jitlink. That's what this test really wants to test. 
--- .../JITLink/RISCV/{riscv_reloc_add.s => ELF_reloc_add.s} | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) rename llvm/test/ExecutionEngine/JITLink/RISCV/{riscv_reloc_add.s => ELF_reloc_add.s} (82%) diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/riscv_reloc_add.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_reloc_add.s similarity index 82% rename from llvm/test/ExecutionEngine/JITLink/RISCV/riscv_reloc_add.s rename to llvm/test/ExecutionEngine/JITLink/RISCV/ELF_reloc_add.s index 13689b6d8a026..01f9e7eb5653d 100644 --- a/llvm/test/ExecutionEngine/JITLink/RISCV/riscv_reloc_add.s +++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_reloc_add.s @@ -1,6 +1,8 @@ # RUN: rm -rf %t && mkdir -p %t -# RUN: llvm-mc -triple=riscv64 -filetype=obj -o %t/riscv64_reloc_add.o %s -# RUN: llvm-mc -triple=riscv32 -filetype=obj -o %t/riscv32_reloc_add.o %s +# RUN: llvm-mc -triple=riscv64 -mattr=+relax -filetype=obj \ +# RUN: -o %t/riscv64_reloc_add.o %s +# RUN: llvm-mc -triple=riscv32 -mattr=+relax -filetype=obj \ +# RUN: -o %t/riscv32_reloc_add.o %s # RUN: llvm-jitlink -noexec -check %s %t/riscv64_reloc_add.o \ # RUN: -slab-allocate=1Mb -slab-address=0x1000 -slab-page-size=0x1000 # RUN: llvm-jitlink -noexec -check %s %t/riscv32_reloc_add.o \ From 3792b36234b6c87d728f0a905543e284bf961460 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 2 Jan 2025 17:08:18 -0800 Subject: [PATCH 040/480] [lld][WebAssembly] Replace config-> with ctx.arg. Change the global variable reference to a member access of another variable `ctx`. In the future, we may pass through `ctx` to functions to eliminate global variables. 
Pull Request: https://github.com/llvm/llvm-project/pull/119835 --- lld/wasm/Config.h | 10 +- lld/wasm/Driver.cpp | 306 ++++++++++++++++----------------- lld/wasm/InputChunks.cpp | 8 +- lld/wasm/InputChunks.h | 4 +- lld/wasm/InputElement.h | 4 +- lld/wasm/InputFiles.cpp | 14 +- lld/wasm/InputFiles.h | 2 +- lld/wasm/LTO.cpp | 80 ++++----- lld/wasm/MapFile.cpp | 6 +- lld/wasm/MarkLive.cpp | 10 +- lld/wasm/OutputSections.cpp | 6 +- lld/wasm/Relocations.cpp | 20 +-- lld/wasm/SymbolTable.cpp | 28 +-- lld/wasm/Symbols.cpp | 6 +- lld/wasm/Symbols.h | 2 +- lld/wasm/SyntheticSections.cpp | 56 +++--- lld/wasm/SyntheticSections.h | 14 +- lld/wasm/Writer.cpp | 182 ++++++++++---------- 18 files changed, 374 insertions(+), 384 deletions(-) diff --git a/lld/wasm/Config.h b/lld/wasm/Config.h index 0c2ba3eebffc4..1fa6c42d9cd86 100644 --- a/lld/wasm/Config.h +++ b/lld/wasm/Config.h @@ -126,17 +126,9 @@ struct Config { llvm::SmallVector buildIdVector; }; -struct ConfigWrapper { - Config c; - Config *operator->() { return &c; } -}; - -// The only instance of Configuration struct. -extern ConfigWrapper config; - // The Ctx object hold all other (non-configuration) global state. 
struct Ctx { - Config &arg; + Config arg; llvm::SmallVector objectFiles; llvm::SmallVector stubFiles; diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 02471950fb519..c3a74dde6480e 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -44,17 +44,16 @@ using namespace llvm::sys; using namespace llvm::wasm; namespace lld::wasm { -ConfigWrapper config; Ctx ctx; void errorOrWarn(const llvm::Twine &msg) { - if (config->noinhibitExec) + if (ctx.arg.noinhibitExec) warn(msg); else error(msg); } -Ctx::Ctx() : arg(config.c) {} +Ctx::Ctx() {} void Ctx::reset() { arg.~Config(); @@ -268,7 +267,7 @@ opt::InputArgList WasmOptTable::parse(ArrayRef argv) { static void readImportFile(StringRef filename) { if (std::optional buf = readFile(filename)) for (StringRef sym : args::getLines(*buf)) - config->allowUndefinedSymbols.insert(sym); + ctx.arg.allowUndefinedSymbols.insert(sym); } // Returns slices of MB by parsing MB as an archive file. @@ -345,7 +344,7 @@ void LinkerDriver::addFile(StringRef path) { case file_magic::bitcode: case file_magic::wasm_object: { auto obj = createObjectFile(mbref, "", 0, inLib); - if (config->isStatic && isa(obj)) { + if (ctx.arg.isStatic && isa(obj)) { error("attempted static link of dynamic object " + path); break; } @@ -364,7 +363,7 @@ void LinkerDriver::addFile(StringRef path) { } static std::optional findFromSearchPaths(StringRef path) { - for (StringRef dir : config->searchPaths) + for (StringRef dir : ctx.arg.searchPaths) if (std::optional s = findFile(dir, path)) return s; return std::nullopt; @@ -373,8 +372,8 @@ static std::optional findFromSearchPaths(StringRef path) { // This is for -l. We'll look for lib.a from // search paths. 
static std::optional searchLibraryBaseName(StringRef name) { - for (StringRef dir : config->searchPaths) { - if (!config->isStatic) + for (StringRef dir : ctx.arg.searchPaths) { + if (!ctx.arg.isStatic) if (std::optional s = findFile(dir, "lib" + name + ".so")) return s; if (std::optional s = findFile(dir, "lib" + name + ".a")) @@ -408,10 +407,10 @@ void LinkerDriver::createFiles(opt::InputArgList &args) { addFile(arg->getValue()); break; case OPT_Bstatic: - config->isStatic = true; + ctx.arg.isStatic = true; break; case OPT_Bdynamic: - config->isStatic = false; + ctx.arg.isStatic = false; break; case OPT_whole_archive: inWholeArchive = true; @@ -527,99 +526,98 @@ getBuildId(opt::InputArgList &args) { // Initializes Config members by the command line options. static void readConfigs(opt::InputArgList &args) { - config->allowMultipleDefinition = + ctx.arg.allowMultipleDefinition = hasZOption(args, "muldefs") || args.hasFlag(OPT_allow_multiple_definition, OPT_no_allow_multiple_definition, false); - config->bsymbolic = args.hasArg(OPT_Bsymbolic); - config->checkFeatures = + ctx.arg.bsymbolic = args.hasArg(OPT_Bsymbolic); + ctx.arg.checkFeatures = args.hasFlag(OPT_check_features, OPT_no_check_features, true); - config->compressRelocations = args.hasArg(OPT_compress_relocations); - config->demangle = args.hasFlag(OPT_demangle, OPT_no_demangle, true); - config->disableVerify = args.hasArg(OPT_disable_verify); - config->emitRelocs = args.hasArg(OPT_emit_relocs); - config->experimentalPic = args.hasArg(OPT_experimental_pic); - config->entry = getEntry(args); - config->exportAll = args.hasArg(OPT_export_all); - config->exportTable = args.hasArg(OPT_export_table); - config->growableTable = args.hasArg(OPT_growable_table); - config->noinhibitExec = args.hasArg(OPT_noinhibit_exec); + ctx.arg.compressRelocations = args.hasArg(OPT_compress_relocations); + ctx.arg.demangle = args.hasFlag(OPT_demangle, OPT_no_demangle, true); + ctx.arg.disableVerify = 
args.hasArg(OPT_disable_verify); + ctx.arg.emitRelocs = args.hasArg(OPT_emit_relocs); + ctx.arg.experimentalPic = args.hasArg(OPT_experimental_pic); + ctx.arg.entry = getEntry(args); + ctx.arg.exportAll = args.hasArg(OPT_export_all); + ctx.arg.exportTable = args.hasArg(OPT_export_table); + ctx.arg.growableTable = args.hasArg(OPT_growable_table); + ctx.arg.noinhibitExec = args.hasArg(OPT_noinhibit_exec); if (args.hasArg(OPT_import_memory_with_name)) { - config->memoryImport = + ctx.arg.memoryImport = args.getLastArgValue(OPT_import_memory_with_name).split(","); } else if (args.hasArg(OPT_import_memory)) { - config->memoryImport = + ctx.arg.memoryImport = std::pair(defaultModule, memoryName); } else { - config->memoryImport = + ctx.arg.memoryImport = std::optional>(); } if (args.hasArg(OPT_export_memory_with_name)) { - config->memoryExport = - args.getLastArgValue(OPT_export_memory_with_name); + ctx.arg.memoryExport = args.getLastArgValue(OPT_export_memory_with_name); } else if (args.hasArg(OPT_export_memory)) { - config->memoryExport = memoryName; + ctx.arg.memoryExport = memoryName; } else { - config->memoryExport = std::optional(); + ctx.arg.memoryExport = std::optional(); } - config->sharedMemory = args.hasArg(OPT_shared_memory); - config->soName = args.getLastArgValue(OPT_soname); - config->importTable = args.hasArg(OPT_import_table); - config->importUndefined = args.hasArg(OPT_import_undefined); - config->ltoo = args::getInteger(args, OPT_lto_O, 2); - if (config->ltoo > 3) - error("invalid optimization level for LTO: " + Twine(config->ltoo)); + ctx.arg.sharedMemory = args.hasArg(OPT_shared_memory); + ctx.arg.soName = args.getLastArgValue(OPT_soname); + ctx.arg.importTable = args.hasArg(OPT_import_table); + ctx.arg.importUndefined = args.hasArg(OPT_import_undefined); + ctx.arg.ltoo = args::getInteger(args, OPT_lto_O, 2); + if (ctx.arg.ltoo > 3) + error("invalid optimization level for LTO: " + Twine(ctx.arg.ltoo)); unsigned ltoCgo = - args::getInteger(args, 
OPT_lto_CGO, args::getCGOptLevel(config->ltoo)); + args::getInteger(args, OPT_lto_CGO, args::getCGOptLevel(ctx.arg.ltoo)); if (auto level = CodeGenOpt::getLevel(ltoCgo)) - config->ltoCgo = *level; + ctx.arg.ltoCgo = *level; else error("invalid codegen optimization level for LTO: " + Twine(ltoCgo)); - config->ltoPartitions = args::getInteger(args, OPT_lto_partitions, 1); - config->ltoObjPath = args.getLastArgValue(OPT_lto_obj_path_eq); - config->ltoDebugPassManager = args.hasArg(OPT_lto_debug_pass_manager); - config->mapFile = args.getLastArgValue(OPT_Map); - config->optimize = args::getInteger(args, OPT_O, 1); - config->outputFile = args.getLastArgValue(OPT_o); - config->relocatable = args.hasArg(OPT_relocatable); - config->gcSections = - args.hasFlag(OPT_gc_sections, OPT_no_gc_sections, !config->relocatable); + ctx.arg.ltoPartitions = args::getInteger(args, OPT_lto_partitions, 1); + ctx.arg.ltoObjPath = args.getLastArgValue(OPT_lto_obj_path_eq); + ctx.arg.ltoDebugPassManager = args.hasArg(OPT_lto_debug_pass_manager); + ctx.arg.mapFile = args.getLastArgValue(OPT_Map); + ctx.arg.optimize = args::getInteger(args, OPT_O, 1); + ctx.arg.outputFile = args.getLastArgValue(OPT_o); + ctx.arg.relocatable = args.hasArg(OPT_relocatable); + ctx.arg.gcSections = + args.hasFlag(OPT_gc_sections, OPT_no_gc_sections, !ctx.arg.relocatable); for (auto *arg : args.filtered(OPT_keep_section)) - config->keepSections.insert(arg->getValue()); - config->mergeDataSegments = + ctx.arg.keepSections.insert(arg->getValue()); + ctx.arg.mergeDataSegments = args.hasFlag(OPT_merge_data_segments, OPT_no_merge_data_segments, - !config->relocatable); - config->pie = args.hasFlag(OPT_pie, OPT_no_pie, false); - config->printGcSections = + !ctx.arg.relocatable); + ctx.arg.pie = args.hasFlag(OPT_pie, OPT_no_pie, false); + ctx.arg.printGcSections = args.hasFlag(OPT_print_gc_sections, OPT_no_print_gc_sections, false); - config->saveTemps = args.hasArg(OPT_save_temps); - config->searchPaths = 
args::getStrings(args, OPT_library_path); - config->shared = args.hasArg(OPT_shared); - config->shlibSigCheck = !args.hasArg(OPT_no_shlib_sigcheck); - config->stripAll = args.hasArg(OPT_strip_all); - config->stripDebug = args.hasArg(OPT_strip_debug); - config->stackFirst = args.hasArg(OPT_stack_first); - config->trace = args.hasArg(OPT_trace); - config->thinLTOCacheDir = args.getLastArgValue(OPT_thinlto_cache_dir); - config->thinLTOCachePolicy = CHECK( + ctx.arg.saveTemps = args.hasArg(OPT_save_temps); + ctx.arg.searchPaths = args::getStrings(args, OPT_library_path); + ctx.arg.shared = args.hasArg(OPT_shared); + ctx.arg.shlibSigCheck = !args.hasArg(OPT_no_shlib_sigcheck); + ctx.arg.stripAll = args.hasArg(OPT_strip_all); + ctx.arg.stripDebug = args.hasArg(OPT_strip_debug); + ctx.arg.stackFirst = args.hasArg(OPT_stack_first); + ctx.arg.trace = args.hasArg(OPT_trace); + ctx.arg.thinLTOCacheDir = args.getLastArgValue(OPT_thinlto_cache_dir); + ctx.arg.thinLTOCachePolicy = CHECK( parseCachePruningPolicy(args.getLastArgValue(OPT_thinlto_cache_policy)), "--thinlto-cache-policy: invalid cache policy"); - config->thinLTOEmitImportsFiles = args.hasArg(OPT_thinlto_emit_imports_files); - config->thinLTOEmitIndexFiles = args.hasArg(OPT_thinlto_emit_index_files) || + ctx.arg.thinLTOEmitImportsFiles = args.hasArg(OPT_thinlto_emit_imports_files); + ctx.arg.thinLTOEmitIndexFiles = args.hasArg(OPT_thinlto_emit_index_files) || args.hasArg(OPT_thinlto_index_only) || args.hasArg(OPT_thinlto_index_only_eq); - config->thinLTOIndexOnly = args.hasArg(OPT_thinlto_index_only) || + ctx.arg.thinLTOIndexOnly = args.hasArg(OPT_thinlto_index_only) || args.hasArg(OPT_thinlto_index_only_eq); - config->thinLTOIndexOnlyArg = args.getLastArgValue(OPT_thinlto_index_only_eq); - config->thinLTOObjectSuffixReplace = + ctx.arg.thinLTOIndexOnlyArg = args.getLastArgValue(OPT_thinlto_index_only_eq); + ctx.arg.thinLTOObjectSuffixReplace = getOldNewOptions(args, OPT_thinlto_object_suffix_replace_eq); - 
std::tie(config->thinLTOPrefixReplaceOld, config->thinLTOPrefixReplaceNew, - config->thinLTOPrefixReplaceNativeObject) = + std::tie(ctx.arg.thinLTOPrefixReplaceOld, ctx.arg.thinLTOPrefixReplaceNew, + ctx.arg.thinLTOPrefixReplaceNativeObject) = getOldNewOptionsExtra(args, OPT_thinlto_prefix_replace_eq); - if (config->thinLTOEmitIndexFiles && !config->thinLTOIndexOnly) { + if (ctx.arg.thinLTOEmitIndexFiles && !ctx.arg.thinLTOIndexOnly) { if (args.hasArg(OPT_thinlto_object_suffix_replace_eq)) error("--thinlto-object-suffix-replace is not supported with " "--thinlto-emit-index-files"); @@ -627,45 +625,45 @@ static void readConfigs(opt::InputArgList &args) { error("--thinlto-prefix-replace is not supported with " "--thinlto-emit-index-files"); } - if (!config->thinLTOPrefixReplaceNativeObject.empty() && - config->thinLTOIndexOnlyArg.empty()) { + if (!ctx.arg.thinLTOPrefixReplaceNativeObject.empty() && + ctx.arg.thinLTOIndexOnlyArg.empty()) { error("--thinlto-prefix-replace=old_dir;new_dir;obj_dir must be used with " "--thinlto-index-only="); } - config->unresolvedSymbols = getUnresolvedSymbolPolicy(args); - config->whyExtract = args.getLastArgValue(OPT_why_extract); + ctx.arg.unresolvedSymbols = getUnresolvedSymbolPolicy(args); + ctx.arg.whyExtract = args.getLastArgValue(OPT_why_extract); errorHandler().verbose = args.hasArg(OPT_verbose); LLVM_DEBUG(errorHandler().verbose = true); - config->tableBase = args::getInteger(args, OPT_table_base, 0); - config->globalBase = args::getInteger(args, OPT_global_base, 0); - config->initialHeap = args::getInteger(args, OPT_initial_heap, 0); - config->initialMemory = args::getInteger(args, OPT_initial_memory, 0); - config->maxMemory = args::getInteger(args, OPT_max_memory, 0); - config->noGrowableMemory = args.hasArg(OPT_no_growable_memory); - config->zStackSize = + ctx.arg.tableBase = args::getInteger(args, OPT_table_base, 0); + ctx.arg.globalBase = args::getInteger(args, OPT_global_base, 0); + ctx.arg.initialHeap = 
args::getInteger(args, OPT_initial_heap, 0); + ctx.arg.initialMemory = args::getInteger(args, OPT_initial_memory, 0); + ctx.arg.maxMemory = args::getInteger(args, OPT_max_memory, 0); + ctx.arg.noGrowableMemory = args.hasArg(OPT_no_growable_memory); + ctx.arg.zStackSize = args::getZOptionValue(args, OPT_z, "stack-size", WasmPageSize); // -Bdynamic by default if -pie or -shared is specified. - if (config->pie || config->shared) - config->isStatic = false; + if (ctx.arg.pie || ctx.arg.shared) + ctx.arg.isStatic = false; - if (config->maxMemory != 0 && config->noGrowableMemory) { + if (ctx.arg.maxMemory != 0 && ctx.arg.noGrowableMemory) { // Erroring out here is simpler than defining precedence rules. error("--max-memory is incompatible with --no-growable-memory"); } // Default value of exportDynamic depends on `-shared` - config->exportDynamic = - args.hasFlag(OPT_export_dynamic, OPT_no_export_dynamic, config->shared); + ctx.arg.exportDynamic = + args.hasFlag(OPT_export_dynamic, OPT_no_export_dynamic, ctx.arg.shared); // Parse wasm32/64. 
if (auto *arg = args.getLastArg(OPT_m)) { StringRef s = arg->getValue(); if (s == "wasm32") - config->is64 = false; + ctx.arg.is64 = false; else if (s == "wasm64") - config->is64 = true; + ctx.arg.is64 = true; else error("invalid target architecture: " + s); } @@ -679,36 +677,36 @@ static void readConfigs(opt::InputArgList &args) { error(arg->getSpelling() + ": expected a positive integer, but got '" + arg->getValue() + "'"); parallel::strategy = hardware_concurrency(threads); - config->thinLTOJobs = v; + ctx.arg.thinLTOJobs = v; } if (auto *arg = args.getLastArg(OPT_thinlto_jobs)) - config->thinLTOJobs = arg->getValue(); + ctx.arg.thinLTOJobs = arg->getValue(); if (auto *arg = args.getLastArg(OPT_features)) { - config->features = + ctx.arg.features = std::optional>(std::vector()); for (StringRef s : arg->getValues()) - config->features->push_back(std::string(s)); + ctx.arg.features->push_back(std::string(s)); } if (auto *arg = args.getLastArg(OPT_extra_features)) { - config->extraFeatures = + ctx.arg.extraFeatures = std::optional>(std::vector()); for (StringRef s : arg->getValues()) - config->extraFeatures->push_back(std::string(s)); + ctx.arg.extraFeatures->push_back(std::string(s)); } // Legacy --allow-undefined flag which is equivalent to // --unresolve-symbols=ignore + --import-undefined if (args.hasArg(OPT_allow_undefined)) { - config->importUndefined = true; - config->unresolvedSymbols = UnresolvedPolicy::Ignore; + ctx.arg.importUndefined = true; + ctx.arg.unresolvedSymbols = UnresolvedPolicy::Ignore; } if (args.hasArg(OPT_print_map)) - config->mapFile = "-"; + ctx.arg.mapFile = "-"; - std::tie(config->buildId, config->buildIdVector) = getBuildId(args); + std::tie(ctx.arg.buildId, ctx.arg.buildIdVector) = getBuildId(args); } // Some Config members do not directly correspond to any particular @@ -716,86 +714,86 @@ static void readConfigs(opt::InputArgList &args) { // This function initialize such members. See Config.h for the details // of these values. 
static void setConfigs() { - ctx.isPic = config->pie || config->shared; + ctx.isPic = ctx.arg.pie || ctx.arg.shared; if (ctx.isPic) { - if (config->exportTable) + if (ctx.arg.exportTable) error("-shared/-pie is incompatible with --export-table"); - config->importTable = true; + ctx.arg.importTable = true; } else { // Default table base. Defaults to 1, reserving 0 for the NULL function // pointer. - if (!config->tableBase) - config->tableBase = 1; + if (!ctx.arg.tableBase) + ctx.arg.tableBase = 1; // The default offset for static/global data, for when --global-base is // not specified on the command line. The precise value of 1024 is // somewhat arbitrary, and pre-dates wasm-ld (Its the value that // emscripten used prior to wasm-ld). - if (!config->globalBase && !config->relocatable && !config->stackFirst) - config->globalBase = 1024; + if (!ctx.arg.globalBase && !ctx.arg.relocatable && !ctx.arg.stackFirst) + ctx.arg.globalBase = 1024; } - if (config->relocatable) { - if (config->exportTable) + if (ctx.arg.relocatable) { + if (ctx.arg.exportTable) error("--relocatable is incompatible with --export-table"); - if (config->growableTable) + if (ctx.arg.growableTable) error("--relocatable is incompatible with --growable-table"); // Ignore any --import-table, as it's redundant. - config->importTable = true; + ctx.arg.importTable = true; } - if (config->shared) { - if (config->memoryExport.has_value()) { + if (ctx.arg.shared) { + if (ctx.arg.memoryExport.has_value()) { error("--export-memory is incompatible with --shared"); } - if (!config->memoryImport.has_value()) { - config->memoryImport = - std::pair(defaultModule, memoryName); + if (!ctx.arg.memoryImport.has_value()) { + ctx.arg.memoryImport = std::pair( + defaultModule, memoryName); } } // If neither export-memory nor import-memory is specified, default to // exporting memory under its default name. 
- if (!config->memoryExport.has_value() && !config->memoryImport.has_value()) { - config->memoryExport = memoryName; + if (!ctx.arg.memoryExport.has_value() && !ctx.arg.memoryImport.has_value()) { + ctx.arg.memoryExport = memoryName; } } // Some command line options or some combinations of them are not allowed. // This function checks for such errors. static void checkOptions(opt::InputArgList &args) { - if (!config->stripDebug && !config->stripAll && config->compressRelocations) + if (!ctx.arg.stripDebug && !ctx.arg.stripAll && ctx.arg.compressRelocations) error("--compress-relocations is incompatible with output debug" " information. Please pass --strip-debug or --strip-all"); - if (config->ltoPartitions == 0) + if (ctx.arg.ltoPartitions == 0) error("--lto-partitions: number of threads must be > 0"); - if (!get_threadpool_strategy(config->thinLTOJobs)) - error("--thinlto-jobs: invalid job count: " + config->thinLTOJobs); + if (!get_threadpool_strategy(ctx.arg.thinLTOJobs)) + error("--thinlto-jobs: invalid job count: " + ctx.arg.thinLTOJobs); - if (config->pie && config->shared) + if (ctx.arg.pie && ctx.arg.shared) error("-shared and -pie may not be used together"); - if (config->outputFile.empty() && !config->thinLTOIndexOnly) + if (ctx.arg.outputFile.empty() && !ctx.arg.thinLTOIndexOnly) error("no output file specified"); - if (config->importTable && config->exportTable) + if (ctx.arg.importTable && ctx.arg.exportTable) error("--import-table and --export-table may not be used together"); - if (config->relocatable) { - if (!config->entry.empty()) + if (ctx.arg.relocatable) { + if (!ctx.arg.entry.empty()) error("entry point specified for relocatable output file"); - if (config->gcSections) + if (ctx.arg.gcSections) error("-r and --gc-sections may not be used together"); - if (config->compressRelocations) + if (ctx.arg.compressRelocations) error("-r -and --compress-relocations may not be used together"); if (args.hasArg(OPT_undefined)) error("-r -and --undefined 
may not be used together"); - if (config->pie) + if (ctx.arg.pie) error("-r and -pie may not be used together"); - if (config->sharedMemory) + if (ctx.arg.sharedMemory) error("-r and --shared-memory may not be used together"); - if (config->globalBase) + if (ctx.arg.globalBase) error("-r and --global-base may not by used together"); } @@ -804,31 +802,31 @@ static void checkOptions(opt::InputArgList &args) { // mode, to give anyone using them a heads-up that they will be changing. // // Also, warn about flags which request explicit exports. - if (!config->experimentalPic) { + if (!ctx.arg.experimentalPic) { // -shared will change meaning when Module Linking is implemented. - if (config->shared) { + if (ctx.arg.shared) { warn("creating shared libraries, with -shared, is not yet stable"); } // -pie will change meaning when Module Linking is implemented. - if (config->pie) { + if (ctx.arg.pie) { warn("creating PIEs, with -pie, is not yet stable"); } - if (config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic) { + if (ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic) { warn("dynamic imports are not yet stable " "(--unresolved-symbols=import-dynamic)"); } } - if (config->bsymbolic && !config->shared) { + if (ctx.arg.bsymbolic && !ctx.arg.shared) { warn("-Bsymbolic is only meaningful when combined with -shared"); } if (ctx.isPic) { - if (config->globalBase) + if (ctx.arg.globalBase) error("--global-base may not be used with -shared/-pie"); - if (config->tableBase) + if (ctx.arg.tableBase) error("--table-base may not be used with -shared/-pie"); } } @@ -851,7 +849,7 @@ static Symbol *handleUndefined(StringRef name, const char *option) { if (auto *lazySym = dyn_cast(sym)) { lazySym->extract(); - if (!config->whyExtract.empty()) + if (!ctx.arg.whyExtract.empty()) ctx.whyExtractRecords.emplace_back(option, sym->getFile(), *sym); } @@ -861,20 +859,20 @@ static Symbol *handleUndefined(StringRef name, const char *option) { static void handleLibcall(StringRef 
name) { Symbol *sym = symtab->find(name); if (sym && sym->isLazy() && isa(sym->getFile())) { - if (!config->whyExtract.empty()) + if (!ctx.arg.whyExtract.empty()) ctx.whyExtractRecords.emplace_back("", sym->getFile(), *sym); cast(sym)->extract(); } } static void writeWhyExtract() { - if (config->whyExtract.empty()) + if (ctx.arg.whyExtract.empty()) return; std::error_code ec; - raw_fd_ostream os(config->whyExtract, ec, sys::fs::OF_None); + raw_fd_ostream os(ctx.arg.whyExtract, ec, sys::fs::OF_None); if (ec) { - error("cannot open --why-extract= file " + config->whyExtract + ": " + + error("cannot open --why-extract= file " + ctx.arg.whyExtract + ": " + ec.message()); return; } @@ -905,14 +903,14 @@ static UndefinedGlobal * createUndefinedGlobal(StringRef name, llvm::wasm::WasmGlobalType *type) { auto *sym = cast(symtab->addUndefinedGlobal( name, std::nullopt, std::nullopt, WASM_SYMBOL_UNDEFINED, nullptr, type)); - config->allowUndefinedSymbols.insert(sym->getName()); + ctx.arg.allowUndefinedSymbols.insert(sym->getName()); sym->isUsedInRegularObj = true; return sym; } static InputGlobal *createGlobal(StringRef name, bool isMutable) { llvm::wasm::WasmGlobal wasmGlobal; - bool is64 = config->is64.value_or(false); + bool is64 = ctx.arg.is64.value_or(false); wasmGlobal.Type = {uint8_t(is64 ? 
WASM_TYPE_I64 : WASM_TYPE_I32), isMutable}; wasmGlobal.InitExpr = intConst(0, is64); wasmGlobal.SymbolName = name; @@ -931,7 +929,7 @@ static GlobalSymbol *createOptionalGlobal(StringRef name, bool isMutable) { // Create ABI-defined synthetic symbols static void createSyntheticSymbols() { - if (config->relocatable) + if (ctx.arg.relocatable) return; static WasmSignature nullSignature = {{}, {}}; @@ -947,11 +945,11 @@ static void createSyntheticSymbols() { "__wasm_call_ctors", WASM_SYMBOL_VISIBILITY_HIDDEN, make(nullSignature, "__wasm_call_ctors")); - bool is64 = config->is64.value_or(false); + bool is64 = ctx.arg.is64.value_or(false); if (ctx.isPic) { WasmSym::stackPointer = - createUndefinedGlobal("__stack_pointer", config->is64.value_or(false) + createUndefinedGlobal("__stack_pointer", ctx.arg.is64.value_or(false) ? &mutableGlobalTypeI64 : &mutableGlobalTypeI32); // For PIC code, we import two global variables (__memory_base and @@ -970,7 +968,7 @@ static void createSyntheticSymbols() { WasmSym::stackPointer->markLive(); } - if (config->sharedMemory) { + if (ctx.arg.sharedMemory) { WasmSym::tlsBase = createGlobalVariable("__tls_base", true); WasmSym::tlsSize = createGlobalVariable("__tls_size", false); WasmSym::tlsAlign = createGlobalVariable("__tls_align", false); @@ -983,12 +981,12 @@ static void createSyntheticSymbols() { } static void createOptionalSymbols() { - if (config->relocatable) + if (ctx.arg.relocatable) return; WasmSym::dsoHandle = symtab->addOptionalDataSymbol("__dso_handle"); - if (!config->shared) + if (!ctx.arg.shared) WasmSym::dataEnd = symtab->addOptionalDataSymbol("__data_end"); if (!ctx.isPic) { @@ -1010,7 +1008,7 @@ static void createOptionalSymbols() { // // __tls_size and __tls_align are not needed in this case since they are only // needed for __wasm_init_tls (which we do not create in this case). 
- if (!config->sharedMemory) + if (!ctx.arg.sharedMemory) WasmSym::tlsBase = createOptionalGlobal("__tls_base", false); } @@ -1035,7 +1033,7 @@ static void processStubLibrariesPreLTO() { // extracted during processStubLibraries, which is too late since // LTO has already being performed at that point. if (needed->isLazy() && isa(needed->getFile())) { - if (!config->whyExtract.empty()) + if (!ctx.arg.whyExtract.empty()) ctx.whyExtractRecords.emplace_back(toString(stub_file), needed->getFile(), *needed); cast(needed)->extract(); @@ -1079,7 +1077,7 @@ static bool addStubSymbolDeps(const StubFile *stub_file, Symbol *sym, if (auto *lazy = dyn_cast(needed)) { depsAdded = true; lazy->extract(); - if (!config->whyExtract.empty()) + if (!ctx.arg.whyExtract.empty()) ctx.whyExtractRecords.emplace_back(toString(stub_file), sym->getFile(), *sym); } diff --git a/lld/wasm/InputChunks.cpp b/lld/wasm/InputChunks.cpp index 9383dcaeb4f55..ccdc92f5c8d71 100644 --- a/lld/wasm/InputChunks.cpp +++ b/lld/wasm/InputChunks.cpp @@ -67,7 +67,7 @@ uint32_t InputChunk::getSize() const { return ms->builder.getSize(); if (const auto *f = dyn_cast(this)) { - if (config->compressRelocations && f->file) { + if (ctx.arg.compressRelocations && f->file) { return f->getCompressedSize(); } } @@ -84,7 +84,7 @@ uint32_t InputChunk::getInputSize() const { // Copy this input chunk to an mmap'ed output file and apply relocations. void InputChunk::writeTo(uint8_t *buf) const { if (const auto *f = dyn_cast(this)) { - if (file && config->compressRelocations) + if (file && ctx.arg.compressRelocations) return f->writeCompressed(buf); } else if (const auto *ms = dyn_cast(this)) { ms->builder.write(buf + outSecOff); @@ -269,7 +269,7 @@ static unsigned getRelocWidth(const WasmRelocation &rel, uint64_t value) { // This function only computes the final output size. It must be called // before getSize() is used to calculate of layout of the code section. 
void InputFunction::calculateSize() { - if (!file || !config->compressRelocations) + if (!file || !ctx.arg.compressRelocations) return; LLVM_DEBUG(dbgs() << "calculateSize: " << name << "\n"); @@ -365,7 +365,7 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { LLVM_DEBUG(dbgs() << "generating runtime relocations: " << name << " count=" << relocations.size() << "\n"); - bool is64 = config->is64.value_or(false); + bool is64 = ctx.arg.is64.value_or(false); bool generated = false; unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST : WASM_OPCODE_I32_CONST; diff --git a/lld/wasm/InputChunks.h b/lld/wasm/InputChunks.h index d6769bcf5c823..f545449e1246f 100644 --- a/lld/wasm/InputChunks.h +++ b/lld/wasm/InputChunks.h @@ -112,7 +112,7 @@ class InputChunk { InputChunk(ObjFile *f, Kind k, StringRef name, uint32_t alignment = 0, uint32_t flags = 0) : name(name), file(f), alignment(alignment), flags(flags), sectionKind(k), - live(!config->gcSections), discarded(false) {} + live(!ctx.arg.gcSections), discarded(false) {} ArrayRef data() const { return rawData; } uint64_t getTombstone() const; @@ -156,7 +156,7 @@ class SyntheticMergedChunk; // be found by looking at the next one). 
struct SectionPiece { SectionPiece(size_t off, uint32_t hash, bool live) - : inputOff(off), live(live || !config->gcSections), hash(hash >> 1) {} + : inputOff(off), live(live || !ctx.arg.gcSections), hash(hash >> 1) {} uint32_t inputOff; uint32_t live : 1; diff --git a/lld/wasm/InputElement.h b/lld/wasm/InputElement.h index 10dc2a3e4a826..c2a24c8ff5f4e 100644 --- a/lld/wasm/InputElement.h +++ b/lld/wasm/InputElement.h @@ -24,7 +24,7 @@ namespace wasm { class InputElement { protected: InputElement(StringRef name, ObjFile *f) - : file(f), live(!config->gcSections), name(name) {} + : file(f), live(!ctx.arg.gcSections), name(name) {} public: StringRef getName() const { return name; } @@ -65,7 +65,7 @@ class InputGlobal : public InputElement { const WasmInitExpr &getInitExpr() const { return initExpr; } void setPointerValue(uint64_t value) { - initExpr = intConst(value, config->is64.value_or(false)); + initExpr = intConst(value, ctx.arg.is64.value_or(false)); } private: diff --git a/lld/wasm/InputFiles.cpp b/lld/wasm/InputFiles.cpp index 221f02aa1c157..614cddddd1b19 100644 --- a/lld/wasm/InputFiles.cpp +++ b/lld/wasm/InputFiles.cpp @@ -47,7 +47,7 @@ std::string toString(const wasm::InputFile *file) { namespace wasm { std::string replaceThinLTOSuffix(StringRef path) { - auto [suffix, repl] = config->thinLTOObjectSuffixReplace; + auto [suffix, repl] = ctx.arg.thinLTOObjectSuffixReplace; if (path.consume_back(suffix)) return (path + repl).str(); return std::string(path); @@ -55,10 +55,10 @@ std::string replaceThinLTOSuffix(StringRef path) { void InputFile::checkArch(Triple::ArchType arch) const { bool is64 = arch == Triple::wasm64; - if (is64 && !config->is64) { + if (is64 && !ctx.arg.is64) { fatal(toString(this) + ": must specify -mwasm64 to process wasm64 object files"); - } else if (config->is64.value_or(false) != is64) { + } else if (ctx.arg.is64.value_or(false) != is64) { fatal(toString(this) + ": wasm32 object file can't be linked in wasm64 mode"); } @@ -169,7 +169,7 
@@ uint64_t ObjFile::calcNewValue(const WasmRelocation &reloc, uint64_t tombstone, uint32_t index = getFunctionSymbol(reloc.Index)->getTableIndex(); if (reloc.Type == R_WASM_TABLE_INDEX_REL_SLEB || reloc.Type == R_WASM_TABLE_INDEX_REL_SLEB64) - index -= config->tableBase; + index -= ctx.arg.tableBase; return index; } case R_WASM_MEMORY_ADDR_LEB: @@ -360,7 +360,7 @@ void ObjFile::addLegacyIndirectFunctionTableIfNeeded( } static bool shouldMerge(const WasmSection &sec) { - if (config->optimize == 0) + if (ctx.arg.optimize == 0) return false; // Sadly we don't have section attributes yet for custom sections, so we // currently go by the name alone. @@ -383,7 +383,7 @@ static bool shouldMerge(const WasmSegment &seg) { // On a regular link we don't merge sections if -O0 (default is -O1). This // sometimes makes the linker significantly faster, although the output will // be bigger. - if (config->optimize == 0) + if (ctx.arg.optimize == 0) return false; // A mergeable section with size 0 is useless because they don't have @@ -845,7 +845,7 @@ BitcodeFile::BitcodeFile(MemoryBufferRef m, StringRef archiveName, this->archiveName = std::string(archiveName); std::string path = mb.getBufferIdentifier().str(); - if (config->thinLTOIndexOnly) + if (ctx.arg.thinLTOIndexOnly) path = replaceThinLTOSuffix(mb.getBufferIdentifier()); // ThinLTO assumes that all MemoryBufferRefs given to it have a unique diff --git a/lld/wasm/InputFiles.h b/lld/wasm/InputFiles.h index 1b1de98d2d17a..fd7fcb13f4426 100644 --- a/lld/wasm/InputFiles.h +++ b/lld/wasm/InputFiles.h @@ -73,7 +73,7 @@ class InputFile { protected: InputFile(Kind k, MemoryBufferRef m) - : mb(m), fileKind(k), live(!config->gcSections) {} + : mb(m), fileKind(k), live(!ctx.arg.gcSections) {} void checkArch(llvm::Triple::ArchType arch) const; diff --git a/lld/wasm/LTO.cpp b/lld/wasm/LTO.cpp index d9fff748bdb65..b9bd48acd6dc1 100644 --- a/lld/wasm/LTO.cpp +++ b/lld/wasm/LTO.cpp @@ -44,8 +44,8 @@ using namespace lld::wasm; using 
namespace lld; static std::string getThinLTOOutputFile(StringRef modulePath) { - return lto::getThinLTOOutputFile(modulePath, config->thinLTOPrefixReplaceOld, - config->thinLTOPrefixReplaceNew); + return lto::getThinLTOOutputFile(modulePath, ctx.arg.thinLTOPrefixReplaceOld, + ctx.arg.thinLTOPrefixReplaceNew); } static lto::Config createConfig() { @@ -56,23 +56,23 @@ static lto::Config createConfig() { c.Options.FunctionSections = true; c.Options.DataSections = true; - c.DisableVerify = config->disableVerify; + c.DisableVerify = ctx.arg.disableVerify; c.DiagHandler = diagnosticHandler; - c.OptLevel = config->ltoo; + c.OptLevel = ctx.arg.ltoo; c.MAttrs = getMAttrs(); - c.CGOptLevel = config->ltoCgo; - c.DebugPassManager = config->ltoDebugPassManager; - c.AlwaysEmitRegularLTOObj = !config->ltoObjPath.empty(); + c.CGOptLevel = ctx.arg.ltoCgo; + c.DebugPassManager = ctx.arg.ltoDebugPassManager; + c.AlwaysEmitRegularLTOObj = !ctx.arg.ltoObjPath.empty(); - if (config->relocatable) + if (ctx.arg.relocatable) c.RelocModel = std::nullopt; else if (ctx.isPic) c.RelocModel = Reloc::PIC_; else c.RelocModel = Reloc::Static; - if (config->saveTemps) - checkError(c.addSaveTemps(config->outputFile.str() + ".", + if (ctx.arg.saveTemps) + checkError(c.addSaveTemps(ctx.arg.outputFile.str() + ".", /*UseInputModulePath*/ true)); return c; } @@ -81,27 +81,27 @@ namespace lld::wasm { BitcodeCompiler::BitcodeCompiler() { // Initialize indexFile. - if (!config->thinLTOIndexOnlyArg.empty()) - indexFile = openFile(config->thinLTOIndexOnlyArg); + if (!ctx.arg.thinLTOIndexOnlyArg.empty()) + indexFile = openFile(ctx.arg.thinLTOIndexOnlyArg); // Initialize ltoObj. 
lto::ThinBackend backend; auto onIndexWrite = [&](StringRef s) { thinIndices.erase(s); }; - if (config->thinLTOIndexOnly) { + if (ctx.arg.thinLTOIndexOnly) { backend = lto::createWriteIndexesThinBackend( - llvm::hardware_concurrency(config->thinLTOJobs), - std::string(config->thinLTOPrefixReplaceOld), - std::string(config->thinLTOPrefixReplaceNew), - std::string(config->thinLTOPrefixReplaceNativeObject), - config->thinLTOEmitImportsFiles, indexFile.get(), onIndexWrite); + llvm::hardware_concurrency(ctx.arg.thinLTOJobs), + std::string(ctx.arg.thinLTOPrefixReplaceOld), + std::string(ctx.arg.thinLTOPrefixReplaceNew), + std::string(ctx.arg.thinLTOPrefixReplaceNativeObject), + ctx.arg.thinLTOEmitImportsFiles, indexFile.get(), onIndexWrite); } else { backend = lto::createInProcessThinBackend( - llvm::heavyweight_hardware_concurrency(config->thinLTOJobs), - onIndexWrite, config->thinLTOEmitIndexFiles, - config->thinLTOEmitImportsFiles); + llvm::heavyweight_hardware_concurrency(ctx.arg.thinLTOJobs), + onIndexWrite, ctx.arg.thinLTOEmitIndexFiles, + ctx.arg.thinLTOEmitImportsFiles); } ltoObj = std::make_unique(createConfig(), backend, - config->ltoPartitions); + ctx.arg.ltoPartitions); } BitcodeCompiler::~BitcodeCompiler() = default; @@ -123,7 +123,7 @@ void BitcodeCompiler::add(BitcodeFile &f) { ArrayRef syms = f.getSymbols(); std::vector resols(syms.size()); - if (config->thinLTOEmitIndexFiles) { + if (ctx.arg.thinLTOEmitIndexFiles) { thinIndices.insert(obj.getName()); } @@ -139,7 +139,7 @@ void BitcodeCompiler::add(BitcodeFile &f) { // Once IRObjectFile is fixed to report only one symbol this hack can // be removed. 
r.Prevailing = !objSym.isUndefined() && sym->getFile() == &f; - r.VisibleToRegularObj = config->relocatable || sym->isUsedInRegularObj || + r.VisibleToRegularObj = ctx.arg.relocatable || sym->isUsedInRegularObj || sym->isNoStrip() || (r.Prevailing && sym->isExported()); if (r.Prevailing) @@ -175,7 +175,7 @@ static void thinLTOCreateEmptyIndexFiles() { ModuleSummaryIndex m(/*HaveGVs*/ false); m.setSkipModuleByDistributedBackend(); writeIndexToFile(m, *os); - if (config->thinLTOEmitImportsFiles) + if (ctx.arg.thinLTOEmitImportsFiles) openFile(path + ".imports"); } } @@ -191,8 +191,8 @@ std::vector BitcodeCompiler::compile() { // to cache native object files for ThinLTO incremental builds. If a path was // specified, configure LTO to use it as the cache directory. FileCache cache; - if (!config->thinLTOCacheDir.empty()) - cache = check(localCache("ThinLTO", "Thin", config->thinLTOCacheDir, + if (!ctx.arg.thinLTOCacheDir.empty()) + cache = check(localCache("ThinLTO", "Thin", ctx.arg.thinLTOCacheDir, [&](size_t task, const Twine &moduleName, std::unique_ptr mb) { files[task] = std::move(mb); @@ -210,16 +210,16 @@ std::vector BitcodeCompiler::compile() { for (StringRef s : thinIndices) { std::string path(s); openFile(path + ".thinlto.bc"); - if (config->thinLTOEmitImportsFiles) + if (ctx.arg.thinLTOEmitImportsFiles) openFile(path + ".imports"); } - if (config->thinLTOEmitIndexFiles) + if (ctx.arg.thinLTOEmitIndexFiles) thinLTOCreateEmptyIndexFiles(); - if (config->thinLTOIndexOnly) { - if (!config->ltoObjPath.empty()) - saveBuffer(buf[0].second, config->ltoObjPath); + if (ctx.arg.thinLTOIndexOnly) { + if (!ctx.arg.ltoObjPath.empty()) + saveBuffer(buf[0].second, ctx.arg.ltoObjPath); // ThinLTO with index only option is required to generate only the index // files. 
After that, we exit from linker and ThinLTO backend runs in a @@ -229,8 +229,8 @@ std::vector BitcodeCompiler::compile() { return {}; } - if (!config->thinLTOCacheDir.empty()) - pruneCache(config->thinLTOCacheDir, config->thinLTOCachePolicy, files); + if (!ctx.arg.thinLTOCacheDir.empty()) + pruneCache(ctx.arg.thinLTOCacheDir, ctx.arg.thinLTOCachePolicy, files); std::vector ret; for (unsigned i = 0; i != maxTasks; ++i) { @@ -239,7 +239,7 @@ std::vector BitcodeCompiler::compile() { if (objBuf.empty()) continue; ret.emplace_back(objBuf.data(), objBuf.size()); - if (!config->saveTemps) + if (!ctx.arg.saveTemps) continue; // If the input bitcode file is path/to/x.o and -o specifies a.out, the @@ -248,7 +248,7 @@ std::vector BitcodeCompiler::compile() { StringRef ltoObjName; if (bitcodeFilePath == "ld-temp.o") { ltoObjName = - saver().save(Twine(config->outputFile) + ".lto" + + saver().save(Twine(ctx.arg.outputFile) + ".lto" + (i == 0 ? Twine("") : Twine('.') + Twine(i)) + ".o"); } else { StringRef directory = sys::path::parent_path(bitcodeFilePath); @@ -258,7 +258,7 @@ std::vector BitcodeCompiler::compile() { StringRef baseName = bitcodeFilePath.ends_with(")") ? sys::path::filename(bitcodeFilePath) : sys::path::stem(bitcodeFilePath); - StringRef outputFileBaseName = sys::path::filename(config->outputFile); + StringRef outputFileBaseName = sys::path::filename(ctx.arg.outputFile); SmallString<256> path; sys::path::append(path, directory, outputFileBaseName + ".lto." 
+ baseName + ".o"); @@ -268,10 +268,10 @@ std::vector BitcodeCompiler::compile() { saveBuffer(objBuf, ltoObjName); } - if (!config->ltoObjPath.empty()) { - saveBuffer(buf[0].second, config->ltoObjPath); + if (!ctx.arg.ltoObjPath.empty()) { + saveBuffer(buf[0].second, ctx.arg.ltoObjPath); for (unsigned i = 1; i != maxTasks; ++i) - saveBuffer(buf[i].second, config->ltoObjPath + Twine(i)); + saveBuffer(buf[i].second, ctx.arg.ltoObjPath + Twine(i)); } for (std::unique_ptr &file : files) diff --git a/lld/wasm/MapFile.cpp b/lld/wasm/MapFile.cpp index c96b64cb64838..d8487e48b8c6b 100644 --- a/lld/wasm/MapFile.cpp +++ b/lld/wasm/MapFile.cpp @@ -103,14 +103,14 @@ getSymbolStrings(ArrayRef syms) { } void lld::wasm::writeMapFile(ArrayRef outputSections) { - if (config->mapFile.empty()) + if (ctx.arg.mapFile.empty()) return; // Open a map file for writing. std::error_code ec; - raw_fd_ostream os(config->mapFile, ec, sys::fs::OF_None); + raw_fd_ostream os(ctx.arg.mapFile, ec, sys::fs::OF_None); if (ec) { - error("cannot open " + config->mapFile + ": " + ec.message()); + error("cannot open " + ctx.arg.mapFile + ": " + ec.message()); return; } diff --git a/lld/wasm/MarkLive.cpp b/lld/wasm/MarkLive.cpp index 1b99f03747fb0..13c7a3d894fe3 100644 --- a/lld/wasm/MarkLive.cpp +++ b/lld/wasm/MarkLive.cpp @@ -106,8 +106,8 @@ void MarkLive::enqueueRetainedSegments(const ObjFile *file) { void MarkLive::run() { // Add GC root symbols. - if (!config->entry.empty()) - enqueue(symtab->find(config->entry)); + if (!ctx.arg.entry.empty()) + enqueue(symtab->find(ctx.arg.entry)); // We need to preserve any no-strip or exported symbol for (Symbol *sym : symtab->symbols()) @@ -166,7 +166,7 @@ void MarkLive::mark() { } void markLive() { - if (!config->gcSections) + if (!ctx.arg.gcSections) return; LLVM_DEBUG(dbgs() << "markLive\n"); @@ -175,7 +175,7 @@ void markLive() { marker.run(); // Report garbage-collected sections. 
- if (config->printGcSections) { + if (ctx.arg.printGcSections) { for (const ObjFile *obj : ctx.objectFiles) { for (InputChunk *c : obj->functions) if (!c->live) @@ -207,7 +207,7 @@ void markLive() { bool MarkLive::isCallCtorsLive() { // In a reloctable link, we don't call `__wasm_call_ctors`. - if (config->relocatable) + if (ctx.arg.relocatable) return false; // In Emscripten-style PIC, we call `__wasm_call_ctors` which calls diff --git a/lld/wasm/OutputSections.cpp b/lld/wasm/OutputSections.cpp index e4f75829ec4c3..95f7ecc29de6b 100644 --- a/lld/wasm/OutputSections.cpp +++ b/lld/wasm/OutputSections.cpp @@ -105,13 +105,13 @@ void DataSection::finalizeContents() { }); #endif - assert((config->sharedMemory || !ctx.isPic || config->extendedConst || + assert((ctx.arg.sharedMemory || !ctx.isPic || ctx.arg.extendedConst || activeCount <= 1) && "output segments should have been combined by now"); writeUleb128(os, segmentCount, "data segment count"); bodySize = dataSectionHeader.size(); - bool is64 = config->is64.value_or(false); + bool is64 = ctx.arg.is64.value_or(false); for (OutputSegment *segment : segments) { if (!segment->requiredInBinary()) @@ -121,7 +121,7 @@ void DataSection::finalizeContents() { if (segment->initFlags & WASM_DATA_SEGMENT_HAS_MEMINDEX) writeUleb128(os, 0, "memory index"); if ((segment->initFlags & WASM_DATA_SEGMENT_IS_PASSIVE) == 0) { - if (ctx.isPic && config->extendedConst) { + if (ctx.isPic && ctx.arg.extendedConst) { writeU8(os, WASM_OPCODE_GLOBAL_GET, "global get"); writeUleb128(os, WasmSym::memoryBase->getGlobalIndex(), "literal (global index)"); diff --git a/lld/wasm/Relocations.cpp b/lld/wasm/Relocations.cpp index 45ad32701616a..745dfde76ab70 100644 --- a/lld/wasm/Relocations.cpp +++ b/lld/wasm/Relocations.cpp @@ -22,13 +22,13 @@ static bool requiresGOTAccess(const Symbol *sym) { if (sym->isShared()) return true; if (!ctx.isPic && - config->unresolvedSymbols != UnresolvedPolicy::ImportDynamic) + ctx.arg.unresolvedSymbols != 
UnresolvedPolicy::ImportDynamic) return false; if (sym->isHidden() || sym->isLocal()) return false; // With `-Bsymbolic` (or when building an executable) as don't need to use // the GOT for symbols that are defined within the current module. - if (sym->isDefined() && (!config->shared || config->bsymbolic)) + if (sym->isDefined() && (!ctx.arg.shared || ctx.arg.bsymbolic)) return false; return true; } @@ -38,15 +38,15 @@ static bool allowUndefined(const Symbol* sym) { // link time. if (sym->isImported()) return true; - if (isa(sym) && config->importUndefined) + if (isa(sym) && ctx.arg.importUndefined) return true; - return config->allowUndefinedSymbols.count(sym->getName()) != 0; + return ctx.arg.allowUndefinedSymbols.count(sym->getName()) != 0; } static void reportUndefined(ObjFile *file, Symbol *sym) { if (!allowUndefined(sym)) { - switch (config->unresolvedSymbols) { + switch (ctx.arg.unresolvedSymbols) { case UnresolvedPolicy::ReportError: error(toString(file) + ": undefined symbol: " + toString(*sym)); break; @@ -63,8 +63,8 @@ static void reportUndefined(ObjFile *file, Symbol *sym) { if (auto *f = dyn_cast(sym)) { if (!f->stubFunction && - config->unresolvedSymbols != UnresolvedPolicy::ImportDynamic && - !config->importUndefined) { + ctx.arg.unresolvedSymbols != UnresolvedPolicy::ImportDynamic && + !ctx.arg.importUndefined) { f->stubFunction = symtab->createUndefinedStub(*f->getSignature()); f->stubFunction->markLive(); // Mark the function itself as a stub which prevents it from being @@ -125,7 +125,7 @@ void scanRelocations(InputChunk *chunk) { // In single-threaded builds TLS is lowered away and TLS data can be // merged with normal data and allowing TLS relocation in non-TLS // segments. 
- if (config->sharedMemory) { + if (ctx.arg.sharedMemory) { if (!sym->isTLS()) { error(toString(file) + ": relocation " + relocTypeToString(reloc.Type) + @@ -146,7 +146,7 @@ void scanRelocations(InputChunk *chunk) { if (ctx.isPic || (sym->isUndefined() && - config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic)) { + ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic)) { switch (reloc.Type) { case R_WASM_TABLE_INDEX_SLEB: case R_WASM_TABLE_INDEX_SLEB64: @@ -173,7 +173,7 @@ void scanRelocations(InputChunk *chunk) { } } - if (!config->relocatable && sym->isUndefined()) { + if (!ctx.arg.relocatable && sym->isUndefined()) { switch (reloc.Type) { case R_WASM_TABLE_INDEX_REL_SLEB: case R_WASM_TABLE_INDEX_REL_SLEB64: diff --git a/lld/wasm/SymbolTable.cpp b/lld/wasm/SymbolTable.cpp index 4cbf44b4d0398..f57359083d242 100644 --- a/lld/wasm/SymbolTable.cpp +++ b/lld/wasm/SymbolTable.cpp @@ -53,7 +53,7 @@ void SymbolTable::addFile(InputFile *file, StringRef symName) { return; } - if (config->trace) + if (ctx.arg.trace) message(toString(file)); // LLVM bitcode file @@ -125,7 +125,7 @@ std::pair SymbolTable::insertName(StringRef name) { sym->canInline = true; sym->traced = trace; sym->forceExport = false; - sym->referenced = !config->gcSections; + sym->referenced = !ctx.arg.gcSections; symVector.emplace_back(sym); return {sym, true}; } @@ -235,7 +235,7 @@ DefinedFunction *SymbolTable::addSyntheticFunction(StringRef name, DefinedData *SymbolTable::addOptionalDataSymbol(StringRef name, uint64_t value) { Symbol *s = find(name); - if (!s && (config->exportAll || config->exportedSymbols.count(name) != 0)) + if (!s && (ctx.arg.exportAll || ctx.arg.exportedSymbols.count(name) != 0)) s = insertName(name).first; else if (!s || s->isDefined()) return nullptr; @@ -317,7 +317,7 @@ static bool shouldReplace(const Symbol *existing, InputFile *newFile, } // Neither symbol is week. They conflict. 
- if (config->allowMultipleDefinition) + if (ctx.arg.allowMultipleDefinition) return false; errorOrWarn("duplicate symbol: " + toString(*existing) + "\n>>> defined in " + @@ -387,7 +387,7 @@ Symbol *SymbolTable::addSharedFunction(StringRef name, uint32_t flags, checkSig = ud->isCalledDirectly; if (checkSig && !signatureMatches(existingFunction, sig)) { - if (config->shlibSigCheck) { + if (ctx.arg.shlibSigCheck) { reportFunctionSignatureMismatch(name, existingFunction, sig, file); } else { // With --no-shlib-sigcheck we ignore the signature of the function as @@ -637,7 +637,7 @@ Symbol *SymbolTable::addUndefinedFunction(StringRef name, lazy->signature = sig; } else { lazy->extract(); - if (!config->whyExtract.empty()) + if (!ctx.arg.whyExtract.empty()) ctx.whyExtractRecords.emplace_back(toString(file), s->getFile(), *s); } } else { @@ -652,7 +652,7 @@ Symbol *SymbolTable::addUndefinedFunction(StringRef name, if (isCalledDirectly && !signatureMatches(existingFunction, sig)) { if (existingFunction->isShared()) { // Special handling for when the existing function is a shared symbol - if (config->shlibSigCheck) { + if (ctx.arg.shlibSigCheck) { reportFunctionSignatureMismatch(name, existingFunction, sig, file); } else { existingFunction->signature = sig; @@ -788,12 +788,12 @@ TableSymbol *SymbolTable::createUndefinedIndirectFunctionTable(StringRef name) { WasmTableType *type = make(); type->ElemType = ValType::FUNCREF; type->Limits = limits; - uint32_t flags = config->exportTable ? 0 : WASM_SYMBOL_VISIBILITY_HIDDEN; + uint32_t flags = ctx.arg.exportTable ? 
0 : WASM_SYMBOL_VISIBILITY_HIDDEN; flags |= WASM_SYMBOL_UNDEFINED; Symbol *sym = addUndefinedTable(name, name, defaultModule, flags, nullptr, type); sym->markLive(); - sym->forceExport = config->exportTable; + sym->forceExport = ctx.arg.exportTable; return cast(sym); } @@ -803,10 +803,10 @@ TableSymbol *SymbolTable::createDefinedIndirectFunctionTable(StringRef name) { WasmTableType type{ValType::FUNCREF, limits}; WasmTable desc{invalidIndex, type, name}; InputTable *table = make(desc, nullptr); - uint32_t flags = config->exportTable ? 0 : WASM_SYMBOL_VISIBILITY_HIDDEN; + uint32_t flags = ctx.arg.exportTable ? 0 : WASM_SYMBOL_VISIBILITY_HIDDEN; TableSymbol *sym = addSyntheticTable(name, flags, table); sym->markLive(); - sym->forceExport = config->exportTable; + sym->forceExport = ctx.arg.exportTable; return sym; } @@ -830,7 +830,7 @@ TableSymbol *SymbolTable::resolveIndirectFunctionTable(bool required) { } } - if (config->importTable) { + if (ctx.arg.importTable) { if (existing) { existing->importModule = defaultModule; existing->importName = functionTableName; @@ -838,7 +838,7 @@ TableSymbol *SymbolTable::resolveIndirectFunctionTable(bool required) { } if (required) return createUndefinedIndirectFunctionTable(functionTableName); - } else if ((existing && existing->isLive()) || config->exportTable || + } else if ((existing && existing->isLive()) || ctx.arg.exportTable || required) { // A defined table is required. Either because the user request an exported // table or because the table symbol is already live. 
The existing table is @@ -885,7 +885,7 @@ void SymbolTable::addLazy(StringRef name, InputFile *file) { LLVM_DEBUG(dbgs() << "replacing existing undefined\n"); const InputFile *oldFile = s->getFile(); LazySymbol(name, 0, file).extract(); - if (!config->whyExtract.empty()) + if (!ctx.arg.whyExtract.empty()) ctx.whyExtractRecords.emplace_back(toString(oldFile), s->getFile(), *s); } diff --git a/lld/wasm/Symbols.cpp b/lld/wasm/Symbols.cpp index e62e7bec609f5..a687fd6d6c4ef 100644 --- a/lld/wasm/Symbols.cpp +++ b/lld/wasm/Symbols.cpp @@ -35,7 +35,7 @@ std::string maybeDemangleSymbol(StringRef name) { // `main` in the case where we need to pass it arguments. if (name == "__main_argc_argv") return "main"; - if (wasm::config->demangle) + if (wasm::ctx.arg.demangle) return demangle(name); return name.str(); } @@ -235,10 +235,10 @@ bool Symbol::isExported() const { // Shared libraries must export all weakly defined symbols // in case they contain the version that will be chosen by // the dynamic linker. 
- if (config->shared && isLive() && isWeak() && !isHidden()) + if (ctx.arg.shared && isLive() && isWeak() && !isHidden()) return true; - if (config->exportAll || (config->exportDynamic && !isHidden())) + if (ctx.arg.exportAll || (ctx.arg.exportDynamic && !isHidden())) return true; return isExportedExplicit(); diff --git a/lld/wasm/Symbols.h b/lld/wasm/Symbols.h index 80b658773bd20..b409fffc50a6c 100644 --- a/lld/wasm/Symbols.h +++ b/lld/wasm/Symbols.h @@ -139,7 +139,7 @@ class Symbol { protected: Symbol(StringRef name, Kind k, uint32_t flags, InputFile *f) - : name(name), file(f), symbolKind(k), referenced(!config->gcSections), + : name(name), file(f), symbolKind(k), referenced(!ctx.arg.gcSections), requiresGOT(false), isUsedInRegularObj(false), forceExport(false), forceImport(false), canInline(false), traced(false), isStub(false), flags(flags) {} diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp index 6b32d12ebeb45..715fba1ee6da5 100644 --- a/lld/wasm/SyntheticSections.cpp +++ b/lld/wasm/SyntheticSections.cpp @@ -55,7 +55,7 @@ class SubSection { bool DylinkSection::isNeeded() const { return ctx.isPic || - config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic || + ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic || !ctx.sharedFiles.empty(); } @@ -162,7 +162,7 @@ void TypeSection::writeBody() { uint32_t ImportSection::getNumImports() const { assert(isSealed); uint32_t numImports = importedSymbols.size() + gotSymbols.size(); - if (config->memoryImport.has_value()) + if (ctx.arg.memoryImport.has_value()) ++numImports; return numImports; } @@ -232,20 +232,20 @@ void ImportSection::writeBody() { writeUleb128(os, getNumImports(), "import count"); - bool is64 = config->is64.value_or(false); + bool is64 = ctx.arg.is64.value_or(false); - if (config->memoryImport) { + if (ctx.arg.memoryImport) { WasmImport import; - import.Module = config->memoryImport->first; - import.Field = config->memoryImport->second; + import.Module = 
ctx.arg.memoryImport->first; + import.Field = ctx.arg.memoryImport->second; import.Kind = WASM_EXTERNAL_MEMORY; import.Memory.Flags = 0; import.Memory.Minimum = out.memorySec->numMemoryPages; - if (out.memorySec->maxMemoryPages != 0 || config->sharedMemory) { + if (out.memorySec->maxMemoryPages != 0 || ctx.arg.sharedMemory) { import.Memory.Flags |= WASM_LIMITS_FLAG_HAS_MAX; import.Memory.Maximum = out.memorySec->maxMemoryPages; } - if (config->sharedMemory) + if (ctx.arg.sharedMemory) import.Memory.Flags |= WASM_LIMITS_FLAG_IS_SHARED; if (is64) import.Memory.Flags |= WASM_LIMITS_FLAG_IS_64; @@ -351,14 +351,14 @@ void TableSection::assignIndexes() { void MemorySection::writeBody() { raw_ostream &os = bodyOutputStream; - bool hasMax = maxMemoryPages != 0 || config->sharedMemory; + bool hasMax = maxMemoryPages != 0 || ctx.arg.sharedMemory; writeUleb128(os, 1, "memory count"); unsigned flags = 0; if (hasMax) flags |= WASM_LIMITS_FLAG_HAS_MAX; - if (config->sharedMemory) + if (ctx.arg.sharedMemory) flags |= WASM_LIMITS_FLAG_IS_SHARED; - if (config->is64.value_or(false)) + if (ctx.arg.is64.value_or(false)) flags |= WASM_LIMITS_FLAG_IS_64; writeUleb128(os, flags, "memory limits flags"); writeUleb128(os, numMemoryPages, "initial pages"); @@ -415,8 +415,8 @@ void GlobalSection::addInternalGOTEntry(Symbol *sym) { } void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const { - assert(!config->extendedConst); - bool is64 = config->is64.value_or(false); + assert(!ctx.arg.extendedConst); + bool is64 = ctx.arg.is64.value_or(false); unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST : WASM_OPCODE_I32_CONST; unsigned opcode_ptr_add = is64 ? WASM_OPCODE_I64_ADD @@ -466,7 +466,7 @@ void GlobalSection::writeBody() { writeGlobalType(os, g->getType()); writeInitExpr(os, g->getInitExpr()); } - bool is64 = config->is64.value_or(false); + bool is64 = ctx.arg.is64.value_or(false); uint8_t itype = is64 ? 
WASM_TYPE_I64 : WASM_TYPE_I32; for (const Symbol *sym : internalGotSymbols) { bool mutable_ = false; @@ -474,11 +474,11 @@ void GlobalSection::writeBody() { // In the case of dynamic linking, unless we have 'extended-const' // available, these global must to be mutable since they get updated to // the correct runtime value during `__wasm_apply_global_relocs`. - if (!config->extendedConst && ctx.isPic && !sym->isTLS()) + if (!ctx.arg.extendedConst && ctx.isPic && !sym->isTLS()) mutable_ = true; // With multi-theadeding any TLS globals must be mutable since they get // set during `__wasm_apply_global_tls_relocs` - if (config->sharedMemory && sym->isTLS()) + if (ctx.arg.sharedMemory && sym->isTLS()) mutable_ = true; } WasmGlobalType type{itype, mutable_}; @@ -487,7 +487,7 @@ void GlobalSection::writeBody() { bool useExtendedConst = false; uint32_t globalIdx; int64_t offset; - if (config->extendedConst && ctx.isPic) { + if (ctx.arg.extendedConst && ctx.isPic) { if (auto *d = dyn_cast(sym)) { if (!sym->isTLS()) { globalIdx = WasmSym::memoryBase->getGlobalIndex(); @@ -518,7 +518,7 @@ void GlobalSection::writeBody() { // In the sharedMemory case TLS globals are set during // `__wasm_apply_global_tls_relocs`, but in the non-shared case // we know the absolute value at link time. - initExpr = intConst(d->getVA(/*absolute=*/!config->sharedMemory), is64); + initExpr = intConst(d->getVA(/*absolute=*/!ctx.arg.sharedMemory), is64); else if (auto *f = dyn_cast(sym)) initExpr = intConst(f->isStub ? 0 : f->getTableIndex(), is64); else { @@ -566,7 +566,7 @@ void ElemSection::addEntry(FunctionSymbol *sym) { // They only exist so that the calls to missing functions can validate. 
if (sym->hasTableIndex() || sym->isStub) return; - sym->setTableIndex(config->tableBase + indirectFunctions.size()); + sym->setTableIndex(ctx.arg.tableBase + indirectFunctions.size()); indirectFunctions.emplace_back(sym); } @@ -589,8 +589,8 @@ void ElemSection::writeBody() { initExpr.Inst.Opcode = WASM_OPCODE_GLOBAL_GET; initExpr.Inst.Value.Global = WasmSym::tableBase->getGlobalIndex(); } else { - bool is64 = config->is64.value_or(false); - initExpr = intConst(config->tableBase, is64); + bool is64 = ctx.arg.is64.value_or(false); + initExpr = intConst(ctx.arg.tableBase, is64); } writeInitExpr(os, initExpr); @@ -602,7 +602,7 @@ void ElemSection::writeBody() { } writeUleb128(os, indirectFunctions.size(), "elem count"); - uint32_t tableIndex = config->tableBase; + uint32_t tableIndex = ctx.arg.tableBase; for (const FunctionSymbol *sym : indirectFunctions) { assert(sym->getTableIndex() == tableIndex); (void) tableIndex; @@ -622,7 +622,7 @@ void DataCountSection::writeBody() { } bool DataCountSection::isNeeded() const { - return numSegments && config->sharedMemory; + return numSegments && ctx.arg.sharedMemory; } void LinkingSection::writeBody() { @@ -786,9 +786,9 @@ unsigned NameSection::numNamedDataSegments() const { void NameSection::writeBody() { { SubSection sub(WASM_NAMES_MODULE); - StringRef moduleName = config->soName; - if (config->soName.empty()) - moduleName = llvm::sys::path::filename(config->outputFile); + StringRef moduleName = ctx.arg.soName; + if (ctx.arg.soName.empty()) + moduleName = llvm::sys::path::filename(ctx.arg.outputFile); writeStr(sub.os, moduleName, "module name"); sub.writeTo(bodyOutputStream); } @@ -917,14 +917,14 @@ void RelocSection::writeBody() { } static size_t getHashSize() { - switch (config->buildId) { + switch (ctx.arg.buildId) { case BuildIdKind::Fast: case BuildIdKind::Uuid: return 16; case BuildIdKind::Sha1: return 20; case BuildIdKind::Hexstring: - return config->buildIdVector.size(); + return ctx.arg.buildIdVector.size(); case 
BuildIdKind::None: return 0; } diff --git a/lld/wasm/SyntheticSections.h b/lld/wasm/SyntheticSections.h index 10183e93d2a28..068fbed11f4a7 100644 --- a/lld/wasm/SyntheticSections.h +++ b/lld/wasm/SyntheticSections.h @@ -228,7 +228,7 @@ class MemorySection : public SyntheticSection { public: MemorySection() : SyntheticSection(llvm::wasm::WASM_SEC_MEMORY) {} - bool isNeeded() const override { return !config->memoryImport.has_value(); } + bool isNeeded() const override { return !ctx.arg.memoryImport.has_value(); } void writeBody() override; uint64_t numMemoryPages = 0; @@ -286,7 +286,7 @@ class GlobalSection : public SyntheticSection { // transform a `global.get` to an `i32.const`. void addInternalGOTEntry(Symbol *sym); bool needsRelocations() { - if (config->extendedConst) + if (ctx.arg.extendedConst) return false; return llvm::any_of(internalGotSymbols, [=](Symbol *sym) { return !sym->isTLS(); }); @@ -354,7 +354,7 @@ class LinkingSection : public SyntheticSection { : SyntheticSection(llvm::wasm::WASM_SEC_CUSTOM, "linking"), initFunctions(initFunctions), dataSegments(dataSegments) {} bool isNeeded() const override { - return config->relocatable || config->emitRelocs; + return ctx.arg.relocatable || ctx.arg.emitRelocs; } void writeBody() override; void addToSymtab(Symbol *sym); @@ -373,7 +373,7 @@ class NameSection : public SyntheticSection { : SyntheticSection(llvm::wasm::WASM_SEC_CUSTOM, "name"), segments(segments) {} bool isNeeded() const override { - if (config->stripAll && !config->keepSections.count(name)) + if (ctx.arg.stripAll && !ctx.arg.keepSections.count(name)) return false; return numNames() > 0; } @@ -396,7 +396,7 @@ class ProducersSection : public SyntheticSection { ProducersSection() : SyntheticSection(llvm::wasm::WASM_SEC_CUSTOM, "producers") {} bool isNeeded() const override { - if (config->stripAll && !config->keepSections.count(name)) + if (ctx.arg.stripAll && !ctx.arg.keepSections.count(name)) return false; return fieldCount() > 0; } @@ -417,7 
+417,7 @@ class TargetFeaturesSection : public SyntheticSection { TargetFeaturesSection() : SyntheticSection(llvm::wasm::WASM_SEC_CUSTOM, "target_features") {} bool isNeeded() const override { - if (config->stripAll && !config->keepSections.count(name)) + if (ctx.arg.stripAll && !ctx.arg.keepSections.count(name)) return false; return features.size() > 0; } @@ -443,7 +443,7 @@ class BuildIdSection : public SyntheticSection { BuildIdSection(); void writeBody() override; bool isNeeded() const override { - return config->buildId != BuildIdKind::None; + return ctx.arg.buildId != BuildIdKind::None; } void writeBuildId(llvm::ArrayRef buf); void writeTo(uint8_t *buf) override { diff --git a/lld/wasm/Writer.cpp b/lld/wasm/Writer.cpp index aeac1a51824f5..76e38f548157c 100644 --- a/lld/wasm/Writer.cpp +++ b/lld/wasm/Writer.cpp @@ -132,7 +132,7 @@ class Writer { void Writer::calculateCustomSections() { log("calculateCustomSections"); - bool stripDebug = config->stripDebug || config->stripAll; + bool stripDebug = ctx.arg.stripDebug || ctx.arg.stripAll; for (ObjFile *file : ctx.objectFiles) { for (InputChunk *section : file->customSections) { // Exclude COMDAT sections that are not selected for inclusion @@ -172,7 +172,7 @@ void Writer::createCustomSections() { LLVM_DEBUG(dbgs() << "createCustomSection: " << name << "\n"); OutputSection *sec = make(std::string(name), pair.second); - if (config->relocatable || config->emitRelocs) { + if (ctx.arg.relocatable || ctx.arg.emitRelocs) { auto *sym = make(sec); out.linkingSec->addToSymtab(sym); sec->sectionSym = sym; @@ -282,8 +282,8 @@ static void makeUUID(unsigned version, llvm::ArrayRef fileHash, void Writer::writeBuildId() { if (!out.buildIdSec->isNeeded()) return; - if (config->buildId == BuildIdKind::Hexstring) { - out.buildIdSec->writeBuildId(config->buildIdVector); + if (ctx.arg.buildId == BuildIdKind::Hexstring) { + out.buildIdSec->writeBuildId(ctx.arg.buildIdVector); return; } @@ -292,7 +292,7 @@ void Writer::writeBuildId() { 
std::vector buildId(hashSize); llvm::ArrayRef buf{buffer->getBufferStart(), size_t(fileSize)}; - switch (config->buildId) { + switch (ctx.arg.buildId) { case BuildIdKind::Fast: { std::vector fileHash(8); computeHash(fileHash, buf, [](uint8_t *dest, ArrayRef arr) { @@ -324,9 +324,9 @@ static void setGlobalPtr(DefinedGlobal *g, uint64_t memoryPtr) { // to each of the input data sections as well as the explicit stack region. // The default memory layout is as follows, from low to high. // -// - initialized data (starting at config->globalBase) +// - initialized data (starting at ctx.arg.globalBase) // - BSS data (not currently implemented in llvm) -// - explicit stack (config->ZStackSize) +// - explicit stack (ctx.arg.ZStackSize) // - heap start / unallocated // // The --stack-first option means that stack is placed before any static data. @@ -337,33 +337,33 @@ void Writer::layoutMemory() { uint64_t memoryPtr = 0; auto placeStack = [&]() { - if (config->relocatable || ctx.isPic) + if (ctx.arg.relocatable || ctx.isPic) return; memoryPtr = alignTo(memoryPtr, stackAlignment); if (WasmSym::stackLow) WasmSym::stackLow->setVA(memoryPtr); - if (config->zStackSize != alignTo(config->zStackSize, stackAlignment)) + if (ctx.arg.zStackSize != alignTo(ctx.arg.zStackSize, stackAlignment)) error("stack size must be " + Twine(stackAlignment) + "-byte aligned"); - log("mem: stack size = " + Twine(config->zStackSize)); + log("mem: stack size = " + Twine(ctx.arg.zStackSize)); log("mem: stack base = " + Twine(memoryPtr)); - memoryPtr += config->zStackSize; + memoryPtr += ctx.arg.zStackSize; setGlobalPtr(cast(WasmSym::stackPointer), memoryPtr); if (WasmSym::stackHigh) WasmSym::stackHigh->setVA(memoryPtr); log("mem: stack top = " + Twine(memoryPtr)); }; - if (config->stackFirst) { + if (ctx.arg.stackFirst) { placeStack(); - if (config->globalBase) { - if (config->globalBase < memoryPtr) { + if (ctx.arg.globalBase) { + if (ctx.arg.globalBase < memoryPtr) { error("--global-base cannot be 
less than stack size when --stack-first is used"); return; } - memoryPtr = config->globalBase; + memoryPtr = ctx.arg.globalBase; } } else { - memoryPtr = config->globalBase; + memoryPtr = ctx.arg.globalBase; } log("mem: global base = " + Twine(memoryPtr)); @@ -385,7 +385,7 @@ void Writer::layoutMemory() { log(formatv("mem: {0,-15} offset={1,-8} size={2,-8} align={3}", seg->name, memoryPtr, seg->size, seg->alignment)); - if (!config->relocatable && seg->isTLS()) { + if (!ctx.arg.relocatable && seg->isTLS()) { if (WasmSym::tlsSize) { auto *tlsSize = cast(WasmSym::tlsSize); setGlobalPtr(tlsSize, seg->size); @@ -394,7 +394,7 @@ void Writer::layoutMemory() { auto *tlsAlign = cast(WasmSym::tlsAlign); setGlobalPtr(tlsAlign, int64_t{1} << seg->alignment); } - if (!config->sharedMemory && WasmSym::tlsBase) { + if (!ctx.arg.sharedMemory && WasmSym::tlsBase) { auto *tlsBase = cast(WasmSym::tlsBase); setGlobalPtr(tlsBase, memoryPtr); } @@ -404,7 +404,7 @@ void Writer::layoutMemory() { } // Make space for the memory initialization flag - if (config->sharedMemory && hasPassiveInitializedSegments()) { + if (ctx.arg.sharedMemory && hasPassiveInitializedSegments()) { memoryPtr = alignTo(memoryPtr, 4); WasmSym::initMemoryFlag = symtab->addSyntheticDataSymbol( "__wasm_init_memory_flag", WASM_SYMBOL_VISIBILITY_HIDDEN); @@ -423,7 +423,7 @@ void Writer::layoutMemory() { if (ctx.isPic) out.dylinkSec->memSize = staticDataSize; - if (!config->stackFirst) + if (!ctx.arg.stackFirst) placeStack(); if (WasmSym::heapBase) { @@ -438,31 +438,31 @@ void Writer::layoutMemory() { } uint64_t maxMemorySetting = 1ULL << 32; - if (config->is64.value_or(false)) { + if (ctx.arg.is64.value_or(false)) { // TODO: Update once we decide on a reasonable limit here: // https://github.com/WebAssembly/memory64/issues/33 maxMemorySetting = 1ULL << 34; } - if (config->initialHeap != 0) { - if (config->initialHeap != alignTo(config->initialHeap, WasmPageSize)) + if (ctx.arg.initialHeap != 0) { + if 
(ctx.arg.initialHeap != alignTo(ctx.arg.initialHeap, WasmPageSize)) error("initial heap must be " + Twine(WasmPageSize) + "-byte aligned"); uint64_t maxInitialHeap = maxMemorySetting - memoryPtr; - if (config->initialHeap > maxInitialHeap) + if (ctx.arg.initialHeap > maxInitialHeap) error("initial heap too large, cannot be greater than " + Twine(maxInitialHeap)); - memoryPtr += config->initialHeap; + memoryPtr += ctx.arg.initialHeap; } - if (config->initialMemory != 0) { - if (config->initialMemory != alignTo(config->initialMemory, WasmPageSize)) + if (ctx.arg.initialMemory != 0) { + if (ctx.arg.initialMemory != alignTo(ctx.arg.initialMemory, WasmPageSize)) error("initial memory must be " + Twine(WasmPageSize) + "-byte aligned"); - if (memoryPtr > config->initialMemory) + if (memoryPtr > ctx.arg.initialMemory) error("initial memory too small, " + Twine(memoryPtr) + " bytes needed"); - if (config->initialMemory > maxMemorySetting) + if (ctx.arg.initialMemory > maxMemorySetting) error("initial memory too large, cannot be greater than " + Twine(maxMemorySetting)); - memoryPtr = config->initialMemory; + memoryPtr = ctx.arg.initialMemory; } memoryPtr = alignTo(memoryPtr, WasmPageSize); @@ -479,23 +479,23 @@ void Writer::layoutMemory() { } uint64_t maxMemory = 0; - if (config->maxMemory != 0) { - if (config->maxMemory != alignTo(config->maxMemory, WasmPageSize)) + if (ctx.arg.maxMemory != 0) { + if (ctx.arg.maxMemory != alignTo(ctx.arg.maxMemory, WasmPageSize)) error("maximum memory must be " + Twine(WasmPageSize) + "-byte aligned"); - if (memoryPtr > config->maxMemory) + if (memoryPtr > ctx.arg.maxMemory) error("maximum memory too small, " + Twine(memoryPtr) + " bytes needed"); - if (config->maxMemory > maxMemorySetting) + if (ctx.arg.maxMemory > maxMemorySetting) error("maximum memory too large, cannot be greater than " + Twine(maxMemorySetting)); - maxMemory = config->maxMemory; - } else if (config->noGrowableMemory) { + maxMemory = ctx.arg.maxMemory; + } else if 
(ctx.arg.noGrowableMemory) { maxMemory = memoryPtr; } // If no maxMemory config was supplied but we are building with // shared memory, we need to pick a sensible upper limit. - if (config->sharedMemory && maxMemory == 0) { + if (ctx.arg.sharedMemory && maxMemory == 0) { if (ctx.isPic) maxMemory = maxMemorySetting; else @@ -552,7 +552,7 @@ void Writer::addSections() { createCustomSections(); addSection(out.linkingSec); - if (config->emitRelocs || config->relocatable) { + if (ctx.arg.emitRelocs || ctx.arg.relocatable) { createRelocSections(); } @@ -583,18 +583,18 @@ void Writer::populateTargetFeatures() { allowed.insert("mutable-globals"); } - if (config->extraFeatures.has_value()) { - auto &extraFeatures = *config->extraFeatures; + if (ctx.arg.extraFeatures.has_value()) { + auto &extraFeatures = *ctx.arg.extraFeatures; allowed.insert(extraFeatures.begin(), extraFeatures.end()); } // Only infer used features if user did not specify features - bool inferFeatures = !config->features.has_value(); + bool inferFeatures = !ctx.arg.features.has_value(); if (!inferFeatures) { - auto &explicitFeatures = *config->features; + auto &explicitFeatures = *ctx.arg.features; allowed.insert(explicitFeatures.begin(), explicitFeatures.end()); - if (!config->checkFeatures) + if (!ctx.arg.checkFeatures) goto done; } @@ -626,10 +626,10 @@ void Writer::populateTargetFeatures() { for (const auto &key : used.keys()) allowed.insert(std::string(key)); - if (!config->checkFeatures) + if (!ctx.arg.checkFeatures) goto done; - if (config->sharedMemory) { + if (ctx.arg.sharedMemory) { if (disallowed.count("shared-mem")) error("--shared-memory is disallowed by " + disallowed["shared-mem"] + " because it was not compiled with 'atomics' or 'bulk-memory' " @@ -679,19 +679,19 @@ void Writer::populateTargetFeatures() { // instruction, then we can also avoid including the segments. 
// Finally, if we are emitting relocations, they may refer to locations within // the bss segments, so these segments need to exist in the binary. - if (config->emitRelocs || - (config->memoryImport.has_value() && !allowed.count("bulk-memory"))) + if (ctx.arg.emitRelocs || + (ctx.arg.memoryImport.has_value() && !allowed.count("bulk-memory"))) ctx.emitBssSegments = true; if (allowed.count("extended-const")) - config->extendedConst = true; + ctx.arg.extendedConst = true; for (auto &feature : allowed) log("Allowed feature: " + feature); } void Writer::checkImportExportTargetFeatures() { - if (config->relocatable || !config->checkFeatures) + if (ctx.arg.relocatable || !ctx.arg.checkFeatures) return; if (out.targetFeaturesSec->features.count("mutable-globals") == 0) { @@ -727,14 +727,14 @@ static bool shouldImport(Symbol *sym) { // When a symbol is weakly defined in a shared library we need to allow // it to be overridden by another module so need to both import // and export the symbol. - if (config->shared && sym->isWeak() && !sym->isUndefined() && + if (ctx.arg.shared && sym->isWeak() && !sym->isUndefined() && !sym->isHidden()) return true; if (sym->isShared()) return true; if (!sym->isUndefined()) return false; - if (sym->isWeak() && !config->relocatable && !ctx.isPic) + if (sym->isWeak() && !ctx.arg.relocatable && !ctx.isPic) return false; // In PIC mode we only need to import functions when they are called directly. 
@@ -745,10 +745,10 @@ static bool shouldImport(Symbol *sym) { return false; } - if (ctx.isPic || config->relocatable || config->importUndefined || - config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic) + if (ctx.isPic || ctx.arg.relocatable || ctx.arg.importUndefined || + ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic) return true; - if (config->allowUndefinedSymbols.count(sym->getName()) != 0) + if (ctx.arg.allowUndefinedSymbols.count(sym->getName()) != 0) return true; return sym->isImported(); @@ -773,12 +773,12 @@ void Writer::calculateImports() { } void Writer::calculateExports() { - if (config->relocatable) + if (ctx.arg.relocatable) return; - if (!config->relocatable && config->memoryExport.has_value()) { + if (!ctx.arg.relocatable && ctx.arg.memoryExport.has_value()) { out.exportSec->exports.push_back( - WasmExport{*config->memoryExport, WASM_EXTERNAL_MEMORY, 0}); + WasmExport{*ctx.arg.memoryExport, WASM_EXTERNAL_MEMORY, 0}); } unsigned globalIndex = @@ -827,7 +827,7 @@ void Writer::calculateExports() { } void Writer::populateSymtab() { - if (!config->relocatable && !config->emitRelocs) + if (!ctx.arg.relocatable && !ctx.arg.emitRelocs) return; for (Symbol *sym : symtab->symbols()) @@ -931,13 +931,13 @@ static void finalizeIndirectFunctionTable() { out.importSec->addImport(WasmSym::indirectFunctionTable); } - uint32_t tableSize = config->tableBase + out.elemSec->numEntries(); + uint32_t tableSize = ctx.arg.tableBase + out.elemSec->numEntries(); WasmLimits limits = {0, tableSize, 0}; - if (WasmSym::indirectFunctionTable->isDefined() && !config->growableTable) { + if (WasmSym::indirectFunctionTable->isDefined() && !ctx.arg.growableTable) { limits.Flags |= WASM_LIMITS_FLAG_HAS_MAX; limits.Maximum = limits.Minimum; } - if (config->is64.value_or(false)) + if (ctx.arg.is64.value_or(false)) limits.Flags |= WASM_LIMITS_FLAG_IS_64; WasmSym::indirectFunctionTable->setLimits(limits); } @@ -1001,7 +1001,7 @@ static StringRef 
getOutputDataSegmentName(const InputChunk &seg) { // symbols are be relative to single __tls_base. if (seg.isTLS()) return ".tdata"; - if (!config->mergeDataSegments) + if (!ctx.arg.mergeDataSegments) return seg.name; if (seg.name.starts_with(".text.")) return ".text"; @@ -1017,9 +1017,9 @@ static StringRef getOutputDataSegmentName(const InputChunk &seg) { OutputSegment *Writer::createOutputSegment(StringRef name) { LLVM_DEBUG(dbgs() << "new segment: " << name << "\n"); OutputSegment *s = make(name); - if (config->sharedMemory) + if (ctx.arg.sharedMemory) s->initFlags = WASM_DATA_SEGMENT_IS_PASSIVE; - if (!config->relocatable && name.starts_with(".bss")) + if (!ctx.arg.relocatable && name.starts_with(".bss")) s->isBss = true; segments.push_back(s); return s; @@ -1035,7 +1035,7 @@ void Writer::createOutputSegments() { // When running in relocatable mode we can't merge segments that are part // of comdat groups since the ultimate linker needs to be able exclude or // include them individually. - if (config->relocatable && !segment->getComdatName().empty()) { + if (ctx.arg.relocatable && !segment->getComdatName().empty()) { s = createOutputSegment(name); } else { if (segmentMap.count(name) == 0) @@ -1075,8 +1075,8 @@ void Writer::combineOutputSegments() { // combines all data segments into a single .data segment. // This restriction does not apply when the extended const extension is // available: https://github.com/WebAssembly/extended-const - assert(!config->extendedConst); - assert(ctx.isPic && !config->sharedMemory); + assert(!ctx.arg.extendedConst); + assert(ctx.isPic && !ctx.arg.sharedMemory); if (segments.size() <= 1) return; OutputSegment *combined = make(".data"); @@ -1117,7 +1117,7 @@ static void createFunction(DefinedFunction *func, StringRef bodyContent) { bool Writer::needsPassiveInitialization(const OutputSegment *segment) { // If bulk memory features is supported then we can perform bss initialization // (via memory.fill) during `__wasm_init_memory`. 
- if (config->memoryImport.has_value() && !segment->requiredInBinary()) + if (ctx.arg.memoryImport.has_value() && !segment->requiredInBinary()) return true; return segment->initFlags & WASM_DATA_SEGMENT_IS_PASSIVE; } @@ -1129,7 +1129,7 @@ bool Writer::hasPassiveInitializedSegments() { } void Writer::createSyntheticInitFunctions() { - if (config->relocatable) + if (ctx.arg.relocatable) return; static WasmSignature nullSignature = {{}, {}}; @@ -1146,14 +1146,14 @@ void Writer::createSyntheticInitFunctions() { "__wasm_init_memory", WASM_SYMBOL_VISIBILITY_HIDDEN, make(nullSignature, "__wasm_init_memory")); WasmSym::initMemory->markLive(); - if (config->sharedMemory) { + if (ctx.arg.sharedMemory) { // This global is assigned during __wasm_init_memory in the shared memory // case. WasmSym::tlsBase->markLive(); } } - if (config->sharedMemory) { + if (ctx.arg.sharedMemory) { if (out.globalSec->needsTLSRelocations()) { WasmSym::applyGlobalTLSRelocs = symtab->addSyntheticFunction( "__wasm_apply_global_tls_relocs", WASM_SYMBOL_VISIBILITY_HIDDEN, @@ -1203,11 +1203,11 @@ void Writer::createInitMemoryFunction() { assert(WasmSym::initMemory); assert(hasPassiveInitializedSegments()); uint64_t flagAddress; - if (config->sharedMemory) { + if (ctx.arg.sharedMemory) { assert(WasmSym::initMemoryFlag); flagAddress = WasmSym::initMemoryFlag->getVA(); } - bool is64 = config->is64.value_or(false); + bool is64 = ctx.arg.is64.value_or(false); std::string bodyContent; { raw_string_ostream os(bodyContent); @@ -1271,7 +1271,7 @@ void Writer::createInitMemoryFunction() { } }; - if (config->sharedMemory) { + if (ctx.arg.sharedMemory) { // With PIC code we cache the flag address in local 0 if (ctx.isPic) { writeUleb128(os, 1, "num local decls"); @@ -1334,7 +1334,7 @@ void Writer::createInitMemoryFunction() { // When we initialize the TLS segment we also set the `__tls_base` // global. This allows the runtime to use this static copy of the // TLS data for the first/main thread. 
- if (config->sharedMemory && s->isTLS()) { + if (ctx.arg.sharedMemory && s->isTLS()) { if (ctx.isPic) { // Cache the result of the addionion in local 0 writeU8(os, WASM_OPCODE_LOCAL_TEE, "local.tee"); @@ -1368,7 +1368,7 @@ void Writer::createInitMemoryFunction() { } } - if (config->sharedMemory) { + if (ctx.arg.sharedMemory) { // Set flag to 2 to mark end of initialization writeGetFlagAddress(); writeI32Const(os, 2, "flag value"); @@ -1407,7 +1407,7 @@ void Writer::createInitMemoryFunction() { if (needsPassiveInitialization(s) && !s->isBss) { // The TLS region should not be dropped since its is needed // during the initialization of each thread (__wasm_init_tls). - if (config->sharedMemory && s->isTLS()) + if (ctx.arg.sharedMemory && s->isTLS()) continue; // data.drop instruction writeU8(os, WASM_OPCODE_MISC_PREFIX, "bulk-memory prefix"); @@ -1460,7 +1460,7 @@ void Writer::createApplyDataRelocationsFunction() { writeUleb128(os, 0, "num locals"); bool generated = false; for (const OutputSegment *seg : segments) - if (!config->sharedMemory || !seg->isTLS()) + if (!ctx.arg.sharedMemory || !seg->isTLS()) for (const InputChunk *inSeg : seg->inputSegments) generated |= inSeg->generateRelocationCode(os); @@ -1656,7 +1656,7 @@ void Writer::createInitTLSFunction() { // This is then used either when creating the output linking section or to // synthesize the "__wasm_call_ctors" function. void Writer::calculateInitFunctions() { - if (!config->relocatable && !WasmSym::callCtors->isLive()) + if (!ctx.arg.relocatable && !WasmSym::callCtors->isLive()) return; for (ObjFile *file : ctx.objectFiles) { @@ -1708,7 +1708,7 @@ void Writer::run() { // For PIC code the table base is assigned dynamically by the loader. // For non-PIC, we start at 1 so that accessing table index 0 always traps. 
if (!ctx.isPic && WasmSym::definedTableBase) - WasmSym::definedTableBase->setVA(config->tableBase); + WasmSym::definedTableBase->setVA(ctx.arg.tableBase); log("-- createOutputSegments"); createOutputSegments(); @@ -1717,7 +1717,7 @@ void Writer::run() { log("-- layoutMemory"); layoutMemory(); - if (!config->relocatable) { + if (!ctx.arg.relocatable) { // Create linker synthesized __start_SECNAME/__stop_SECNAME symbols // This has to be done after memory layout is performed. for (const OutputSegment *seg : segments) { @@ -1725,7 +1725,7 @@ void Writer::run() { } } - for (auto &pair : config->exportedSymbols) { + for (auto &pair : ctx.arg.exportedSymbols) { Symbol *sym = symtab->find(pair.first()); if (sym && sym->isDefined()) sym->forceExport = true; @@ -1733,12 +1733,12 @@ void Writer::run() { // Delay reporting errors about explicit exports until after // addStartStopSymbols which can create optional symbols. - for (auto &name : config->requiredExports) { + for (auto &name : ctx.arg.requiredExports) { Symbol *sym = symtab->find(name); if (!sym || !sym->isDefined()) { - if (config->unresolvedSymbols == UnresolvedPolicy::ReportError) + if (ctx.arg.unresolvedSymbols == UnresolvedPolicy::ReportError) error(Twine("symbol exported via --export not found: ") + name); - if (config->unresolvedSymbols == UnresolvedPolicy::Warn) + if (ctx.arg.unresolvedSymbols == UnresolvedPolicy::Warn) warn(Twine("symbol exported via --export not found: ") + name); } } @@ -1750,7 +1750,7 @@ void Writer::run() { // `__memory_base` import. Unless we support the extended const expression we // can't do addition inside the constant expression, so we much combine the // segments into a single one that can live at `__memory_base`. - if (ctx.isPic && !config->extendedConst && !config->sharedMemory) { + if (ctx.isPic && !ctx.arg.extendedConst && !ctx.arg.sharedMemory) { // In shared memory mode all data segments are passive and initialized // via __wasm_init_memory. 
log("-- combineOutputSegments"); @@ -1774,7 +1774,7 @@ void Writer::run() { log("-- calculateInitFunctions"); calculateInitFunctions(); - if (!config->relocatable) { + if (!ctx.arg.relocatable) { // Create linker synthesized functions if (WasmSym::applyGlobalRelocs) createApplyGlobalRelocationsFunction(); @@ -1793,7 +1793,7 @@ void Writer::run() { // If the input contains a call to `__wasm_call_ctors`, either in one of // the input objects or an explicit export from the command-line, we // assume ctors and dtors are taken care of already. - if (!config->relocatable && !ctx.isPic && + if (!ctx.arg.relocatable && !ctx.isPic && !WasmSym::callCtors->isUsedInRegularObj && !WasmSym::callCtors->isExported()) { log("-- createCommandExportWrappers"); @@ -1861,14 +1861,14 @@ void Writer::run() { // Open a result file. void Writer::openFile() { - log("writing: " + config->outputFile); + log("writing: " + ctx.arg.outputFile); Expected> bufferOrErr = - FileOutputBuffer::create(config->outputFile, fileSize, + FileOutputBuffer::create(ctx.arg.outputFile, fileSize, FileOutputBuffer::F_executable); if (!bufferOrErr) - error("failed to open " + config->outputFile + ": " + + error("failed to open " + ctx.arg.outputFile + ": " + toString(bufferOrErr.takeError())); else buffer = std::move(*bufferOrErr); From 7531672712b0fb517f1818d512fbdfa6feed4232 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 2 Jan 2025 17:37:44 -0800 Subject: [PATCH 041/480] [flang][cuda][NFC] Remove unused variable (#121533) Failed buildbot after https://github.com/llvm/llvm-project/pull/121524 --- flang/lib/Optimizer/Transforms/CUFOpConversion.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index f08f9e412b885..8c525fc6daff5 
100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -792,10 +792,6 @@ struct CUFSyncDescriptorOpConversion : public mlir::OpRewritePattern { using OpRewritePattern::OpRewritePattern; - CUFSyncDescriptorOpConversion(mlir::MLIRContext *context, - const mlir::SymbolTable &symTab) - : OpRewritePattern(context), symTab{symTab} {} - mlir::LogicalResult matchAndRewrite(cuf::SyncDescriptorOp op, mlir::PatternRewriter &rewriter) const override { @@ -822,9 +818,6 @@ struct CUFSyncDescriptorOpConversion op.erase(); return mlir::success(); } - -private: - const mlir::SymbolTable &symTab; }; class CUFOpConversion : public fir::impl::CUFOpConversionBase { @@ -887,11 +880,11 @@ void cuf::populateCUFToFIRConversionPatterns( const mlir::SymbolTable &symtab, mlir::RewritePatternSet &patterns) { patterns.insert(patterns.getContext(), &dl, &converter); patterns.insert(patterns.getContext()); + CUFFreeOpConversion, CUFSyncDescriptorOpConversion>( + patterns.getContext()); patterns.insert(patterns.getContext(), symtab, &dl, &converter); - patterns.insert( - patterns.getContext(), symtab); + patterns.insert(patterns.getContext(), symtab); } void cuf::populateFIRCUFConversionPatterns(const mlir::SymbolTable &symtab, From c1ecc0d168ad122d858dd5fec475da391f97e959 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20=C3=81lvarez=20Ayll=C3=B3n?= Date: Fri, 3 Jan 2025 02:43:53 +0100 Subject: [PATCH 042/480] [clang] Allow generating module interfaces with parsing errors (#121485) Fixes a regression introduced in commit da00c60dae0040185dc45039c4397f6e746548e9 This functionality was originally added in commit 5834996fefc937d6211dc8c8a5b200068753391a Co-authored-by: Tomasz Kaminski --- clang/include/clang/Serialization/ASTWriter.h | 13 ++++++---- clang/lib/Frontend/FrontendActions.cpp | 6 +++-- clang/lib/Serialization/GeneratePCH.cpp | 5 ++-- clang/test/Modules/pcm-with-errors.cpp | 26 +++++++++++++++++++ 4 files changed, 41 
insertions(+), 9 deletions(-) create mode 100644 clang/test/Modules/pcm-with-errors.cpp diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index cb972f0106402..adb7cce522a80 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -997,13 +997,15 @@ class CXX20ModulesGenerator : public PCHGenerator { virtual Module *getEmittingModule(ASTContext &Ctx) override; CXX20ModulesGenerator(Preprocessor &PP, InMemoryModuleCache &ModuleCache, - StringRef OutputFile, bool GeneratingReducedBMI); + StringRef OutputFile, bool GeneratingReducedBMI, + bool AllowASTWithErrors); public: CXX20ModulesGenerator(Preprocessor &PP, InMemoryModuleCache &ModuleCache, - StringRef OutputFile) + StringRef OutputFile, bool AllowASTWithErrors = false) : CXX20ModulesGenerator(PP, ModuleCache, OutputFile, - /*GeneratingReducedBMI=*/false) {} + /*GeneratingReducedBMI=*/false, + AllowASTWithErrors) {} void HandleTranslationUnit(ASTContext &Ctx) override; }; @@ -1013,9 +1015,10 @@ class ReducedBMIGenerator : public CXX20ModulesGenerator { public: ReducedBMIGenerator(Preprocessor &PP, InMemoryModuleCache &ModuleCache, - StringRef OutputFile) + StringRef OutputFile, bool AllowASTWithErrors = false) : CXX20ModulesGenerator(PP, ModuleCache, OutputFile, - /*GeneratingReducedBMI=*/true) {} + /*GeneratingReducedBMI=*/true, + AllowASTWithErrors) {} }; /// If we can elide the definition of \param D in reduced BMI. 
diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp index e943f143d4c15..30dfa5481d070 100644 --- a/clang/lib/Frontend/FrontendActions.cpp +++ b/clang/lib/Frontend/FrontendActions.cpp @@ -279,12 +279,14 @@ GenerateModuleInterfaceAction::CreateASTConsumer(CompilerInstance &CI, !CI.getFrontendOpts().ModuleOutputPath.empty()) { Consumers.push_back(std::make_unique( CI.getPreprocessor(), CI.getModuleCache(), - CI.getFrontendOpts().ModuleOutputPath)); + CI.getFrontendOpts().ModuleOutputPath, + +CI.getFrontendOpts().AllowPCMWithCompilerErrors)); } Consumers.push_back(std::make_unique( CI.getPreprocessor(), CI.getModuleCache(), - CI.getFrontendOpts().OutputFile)); + CI.getFrontendOpts().OutputFile, + +CI.getFrontendOpts().AllowPCMWithCompilerErrors)); return std::make_unique(std::move(Consumers)); } diff --git a/clang/lib/Serialization/GeneratePCH.cpp b/clang/lib/Serialization/GeneratePCH.cpp index 7a8a951b34f25..a3189bb40b191 100644 --- a/clang/lib/Serialization/GeneratePCH.cpp +++ b/clang/lib/Serialization/GeneratePCH.cpp @@ -102,12 +102,13 @@ void PCHGenerator::anchor() {} CXX20ModulesGenerator::CXX20ModulesGenerator(Preprocessor &PP, InMemoryModuleCache &ModuleCache, StringRef OutputFile, - bool GeneratingReducedBMI) + bool GeneratingReducedBMI, + bool AllowASTWithErrors) : PCHGenerator( PP, ModuleCache, OutputFile, llvm::StringRef(), std::make_shared(), /*Extensions=*/ArrayRef>(), - /*AllowASTWithErrors*/ false, /*IncludeTimestamps=*/false, + AllowASTWithErrors, /*IncludeTimestamps=*/false, /*BuildingImplicitModule=*/false, /*ShouldCacheASTInMemory=*/false, GeneratingReducedBMI) {} diff --git a/clang/test/Modules/pcm-with-errors.cpp b/clang/test/Modules/pcm-with-errors.cpp new file mode 100644 index 0000000000000..1bbc3865ee3ee --- /dev/null +++ b/clang/test/Modules/pcm-with-errors.cpp @@ -0,0 +1,26 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: cd %t + +// RUN: %clang_cc1 -std=c++23 m.cppm -emit-module-interface -o 
m.pcm -fallow-pcm-with-compiler-errors -verify +// RUN: %clang_cc1 -std=c++23 main.cpp -fmodule-file=m=m.pcm -verify -fallow-pcm-with-compiler-errors -verify + +// RUN: %clang_cc1 -std=c++23 m.cppm -fmodules-reduced-bmi -emit-module-interface -o m.pcm -fallow-pcm-with-compiler-errors -verify +// RUN: %clang_cc1 -std=c++23 main.cpp -fmodule-file=m=m.pcm -verify -fallow-pcm-with-compiler-errors -verify + +//--- m.cppm +export module m; + +export int f() { + return 0; +} + +export struct Foo { + __Int bar; // expected-error {{unknown type name '__Int'}} +}; + +//--- main.cpp +// expected-no-diagnostics +import m; // ok + +static_assert(__is_same(decltype(f), int())); // ok From e8cf41311fe6940e096d3c9e8a43338b47cb8b2a Mon Sep 17 00:00:00 2001 From: gulfemsavrun Date: Thu, 2 Jan 2025 18:34:02 -0800 Subject: [PATCH 043/480] Revert "[compiler-rt][rtsan] fopencookie support." (#121537) Reverts llvm/llvm-project#120864 because it broke building compiler-rt on Mac. https://luci-milo.appspot.com/ui/p/fuchsia/builders/toolchain.ci/clang-mac-arm64/b8726812736235038609/overview --- .../lib/rtsan/rtsan_interceptors_posix.cpp | 7 ------ .../tests/rtsan_test_interceptors_posix.cpp | 23 ------------------- 2 files changed, 30 deletions(-) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 072923ab35ae0..4e51f464b5730 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -297,12 +297,6 @@ INTERCEPTOR(FILE *, fdopen, int fd, const char *mode) { return REAL(fdopen)(fd, mode); } -INTERCEPTOR(FILE *, fopencookie, void *cookie, const char *mode, - cookie_io_functions_t funcs) { - __rtsan_notify_intercepted_call("fopencookie"); - return REAL(fopencookie)(cookie, mode, funcs); -} - #if SANITIZER_INTERCEPT_OPEN_MEMSTREAM INTERCEPTOR(FILE *, open_memstream, char **buf, size_t *size) { __rtsan_notify_intercepted_call("open_memstream"); @@ -978,7 +972,6 
@@ void __rtsan::InitializeInterceptors() { INTERCEPT_FUNCTION(fputs); INTERCEPT_FUNCTION(fdopen); INTERCEPT_FUNCTION(freopen); - INTERCEPT_FUNCTION(fopencookie); RTSAN_MAYBE_INTERCEPT_OPEN_MEMSTREAM; RTSAN_MAYBE_INTERCEPT_FMEMOPEN; INTERCEPT_FUNCTION(lseek); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index c9c4d7fc4e99e..b052dd859dcdf 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -353,29 +353,6 @@ TEST_F(RtsanFileTest, FopenDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } -TEST_F(RtsanFileTest, FopenCookieDieWhenRealtime) { - FILE *f = fopen(GetTemporaryFilePath(), "w"); - EXPECT_THAT(f, Ne(nullptr)); - struct fholder { - FILE *fp; - size_t read; - } fh = {f, 0}; - auto CookieRead = [this](void *cookie, char *buf, size_t size) { - fholder *p = reinterpret_cast(cookie); - p->read = fread(static_cast(buf), 1, size, p->fp); - EXPECT_NE(0, p->read); - }; - cookie_io_functions_t funcs = {(cookie_read_function_t *)&CookieRead, nullptr, - nullptr, nullptr}; - auto Func = [&fh, &funcs]() { - FILE *f = fopencookie(&fh, "w", funcs); - EXPECT_THAT(f, Ne(nullptr)); - }; - - ExpectRealtimeDeath(Func, "fopencookie"); - ExpectNonRealtimeSurvival(Func); -} - #if SANITIZER_INTERCEPT_OPEN_MEMSTREAM TEST_F(RtsanFileTest, OpenMemstreamDiesWhenRealtime) { char *buffer; From 510a5c7fc25b2a3c33679480131cd60049747dd1 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 2 Jan 2025 18:46:44 -0800 Subject: [PATCH 044/480] [ELF] Fix .gnu.version crash when .dynsym is discarded Fix #88650 In addition, delete the unneeded comment. 
https://sourceware.org/gnu-gabi/program-loading-and-dynamic-linking.txt --- lld/ELF/SyntheticSections.cpp | 5 ++- .../ELF/linkerscript/discard-section-dynsym.s | 33 +++++++++++++++---- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index baa7a083404fe..10cbfe19b3b0a 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -3798,9 +3798,8 @@ VersionTableSection::VersionTableSection(Ctx &ctx) } void VersionTableSection::finalizeContents() { - // At the moment of june 2016 GNU docs does not mention that sh_link field - // should be set, but Sun docs do. Also readelf relies on this field. - getParent()->link = getPartition(ctx).dynSymTab->getParent()->sectionIndex; + if (OutputSection *osec = getPartition(ctx).dynSymTab->getParent()) + getParent()->link = osec->sectionIndex; } size_t VersionTableSection::getSize() const { diff --git a/lld/test/ELF/linkerscript/discard-section-dynsym.s b/lld/test/ELF/linkerscript/discard-section-dynsym.s index 7c7c9c29cee84..f5d483dca86ec 100644 --- a/lld/test/ELF/linkerscript/discard-section-dynsym.s +++ b/lld/test/ELF/linkerscript/discard-section-dynsym.s @@ -1,24 +1,43 @@ # REQUIRES: aarch64 ## We allow discarding .dynsym, check we don't crash. 
-# RUN: llvm-mc -filetype=obj -triple=aarch64 %s -o %t.o +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=aarch64 a.s -o a.o +# RUN: llvm-mc -filetype=obj -triple=aarch64 c.s -o c.o +# RUN: ld.lld -shared --version-script=c.ver c.o -o c.so -# RUN: echo 'SECTIONS { /DISCARD/ : { *(.dynsym) } }' > %t.lds -# RUN: ld.lld -shared -T %t.lds %t.o -o %t.so -# RUN: llvm-readelf -r %t.so | FileCheck %s +# RUN: echo 'SECTIONS { /DISCARD/ : { *(.dynsym) } }' > 1.lds +# RUN: ld.lld -shared -T 1.lds a.o c.so -o out1.so +# RUN: llvm-readelf -Sr out1.so | FileCheck %s --check-prefixes=CHECK,CHECK1 -# RUN: echo 'SECTIONS { /DISCARD/ : { *(.dynsym .dynstr) } }' > %t.lds -# RUN: ld.lld -shared -T %t.lds %t.o -o %t.so -# RUN: llvm-readelf -r %t.so | FileCheck %s +# RUN: echo 'SECTIONS { /DISCARD/ : { *(.dynsym .dynstr) } }' > 2.lds +# RUN: ld.lld -shared -T 2.lds a.o c.so -o out2.so +# RUN: llvm-readelf -Sr out2.so | FileCheck %s --check-prefixes=CHECK,CHECK2 + +# CHECK: [Nr] Name Type Address Off Size ES Flg Lk Inf Al +# CHECK-NEXT: [ 0] NULL 0000000000000000 000000 000000 00 0 0 0 +# CHECK-NEXT: [ 1] .gnu.version VERSYM 0000000000000000 {{.*}} 000006 02 A 0 0 2 +# CHECK1-NEXT: [ 2] .gnu.version_r VERNEED 0000000000000008 {{.*}} 000020 00 A 5 1 4 +# CHECK2-NEXT: [ 2] .gnu.version_r VERNEED 0000000000000008 {{.*}} 000020 00 A 0 1 4 +# CHECK1: [ 5] .dynstr STRTAB # CHECK: contains 2 entries: # CHECK: R_AARCH64_RELATIVE [[#]] # CHECK-NEXT: R_AARCH64_GLOB_DAT 0{{$}} +#--- a.s adrp x9, :got:var ldr x9, [x9, :got_lo12:var] + bl __libc_start_main .data .align 8 foo: .quad foo + +#--- c.s +.globl __libc_start_main +__libc_start_main: + +#--- c.ver +GLIBC_2.34 { __libc_start_main; }; From 9df375e5eae726c5a90ada70f9535a5e22e90214 Mon Sep 17 00:00:00 2001 From: YAMAMOTO Takashi Date: Fri, 3 Jan 2025 11:53:21 +0900 Subject: [PATCH 045/480] [lld][WebAssembly] Fix non-pie dynamic-linking executable (#108146) The commit 22b7b84860d39da71964c9b329937f2ee1d875ba 
made the symbols provided by shared libraries "defined", and thus effectively made it impossible to generate non-pie dynamically linked executables using --unresolved-symbols=import-dynamic. This commit, based on https://github.com/llvm/llvm-project/pull/109249, fixes it by checking sym->isShared() explicitly. (as a bonus, you don't need to rely on --unresolved-symbols=import-dynamic anymore.) Fixes https://github.com/llvm/llvm-project/issues/107387 --- lld/test/wasm/dylink-non-pie.s | 38 ++++++++++++++++++++++++++++++++++ lld/wasm/Relocations.cpp | 2 +- 2 files changed, 39 insertions(+), 1 deletion(-) create mode 100755 lld/test/wasm/dylink-non-pie.s diff --git a/lld/test/wasm/dylink-non-pie.s b/lld/test/wasm/dylink-non-pie.s new file mode 100755 index 0000000000000..3157b8c32120f --- /dev/null +++ b/lld/test/wasm/dylink-non-pie.s @@ -0,0 +1,38 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.lib.o %p/Inputs/ret32.s +# RUN: wasm-ld -m wasm32 --experimental-pic -shared --no-entry %t.lib.o -o %t.lib.so +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s +# RUN: wasm-ld -m wasm32 -Bdynamic %t.o %t.lib.so -o %t.wasm +# RUN: obj2yaml %t.wasm | FileCheck %s +# RUN: llvm-objdump -d --no-show-raw-insn --no-leading-addr %t.wasm | FileCheck %s --check-prefixes DIS + + .functype ret32 (f32) -> (i32) + .globl _start +_start: + .functype _start () -> () + i32.const f_p + drop + end_function + + .section .data.f_p,"",@ +f_p: + .int32 ret32 + .size f_p, 4 + +# CHECK: Sections: +# CHECK-NEXT: - Type: CUSTOM +# CHECK-NEXT: Name: dylink.0 + +# non-pie executable doesn't import __memory_base +# CHECK: - Type: IMPORT +# CHECK-NOT: Field: __memory_base + +# CHECK: - Type: EXPORT +# CHECK: - Name: __wasm_apply_data_relocs +# CHECK-NEXT: Kind: FUNCTION + +# DIS: <__wasm_apply_data_relocs>: +# DIS-EMPTY: +# DIS-NEXT: i32.const 1024 +# DIS-NEXT: global.get 0 +# DIS-NEXT: i32.store 0 +# DIS-NEXT: end diff --git a/lld/wasm/Relocations.cpp 
b/lld/wasm/Relocations.cpp index 745dfde76ab70..52888ad25034e 100644 --- a/lld/wasm/Relocations.cpp +++ b/lld/wasm/Relocations.cpp @@ -144,7 +144,7 @@ void scanRelocations(InputChunk *chunk) { break; } - if (ctx.isPic || + if (ctx.isPic || sym->isShared() || (sym->isUndefined() && ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic)) { switch (reloc.Type) { From e4372c4454c963c9f52dbf2a10229797f3f1e6fc Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Fri, 3 Jan 2025 11:23:44 +0800 Subject: [PATCH 046/480] [LoongArch] Pre-commit tests for tls-desc scheduling. NFC (#121538) Code sequence for tls-desc in large code model is not expected to be scheduled according to psABI 2.30. A later commit will fix it. --- .../LoongArch/psabi-restricted-scheduling.ll | 75 ++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll index c7de3dcf2ecfd..1773b8e014997 100644 --- a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --post-RA-scheduler=0 < %s \ ; RUN: | FileCheck %s --check-prefix=MEDIUM_NO_SCH ; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --post-RA-scheduler=1 < %s \ @@ -7,6 +6,14 @@ ; RUN: | FileCheck %s --check-prefix=LARGE_NO_SCH ; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --relocation-model=pic --post-RA-scheduler=1 < %s \ ; RUN: | FileCheck %s --check-prefix=LARGE_SCH +; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --enable-tlsdesc \ +; RUN: --post-RA-scheduler=0 < %s | FileCheck %s --check-prefix=MEDIUMDESC_NO_SCH +; RUN: llc --mtriple=loongarch64 
-mattr=+d --code-model=medium --relocation-model=pic --enable-tlsdesc \ +; RUN: --post-RA-scheduler=1 < %s | FileCheck %s --check-prefix=MEDIUMDESC_SCH +; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --relocation-model=pic --enable-tlsdesc \ +; RUN: --post-RA-scheduler=0 < %s | FileCheck %s --check-prefix=LARGEDESC_NO_SCH +; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --relocation-model=pic --enable-tlsdesc \ +; RUN: --post-RA-scheduler=1 < %s | FileCheck %s --check-prefix=LARGEDESC_SCH @g = dso_local global i64 zeroinitializer, align 4 @G = global i64 zeroinitializer, align 4 @@ -194,3 +201,69 @@ define void @foo() nounwind { %v_ie = load volatile i64, ptr @ie ret void } + +define void @baz() nounwind { +; MEDIUMDESC_NO_SCH-LABEL: baz: +; MEDIUMDESC_NO_SCH: # %bb.0: +; MEDIUMDESC_NO_SCH-NEXT: addi.d $sp, $sp, -16 +; MEDIUMDESC_NO_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; MEDIUMDESC_NO_SCH-NEXT: pcalau12i $a0, %desc_pc_hi20(gd) +; MEDIUMDESC_NO_SCH-NEXT: addi.d $a0, $a0, %desc_pc_lo12(gd) +; MEDIUMDESC_NO_SCH-NEXT: ld.d $ra, $a0, %desc_ld(gd) +; MEDIUMDESC_NO_SCH-NEXT: jirl $ra, $ra, %desc_call(gd) +; MEDIUMDESC_NO_SCH-NEXT: add.d $a0, $a0, $tp +; MEDIUMDESC_NO_SCH-NEXT: ld.d $zero, $a0, 0 +; MEDIUMDESC_NO_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; MEDIUMDESC_NO_SCH-NEXT: addi.d $sp, $sp, 16 +; MEDIUMDESC_NO_SCH-NEXT: ret +; +; MEDIUMDESC_SCH-LABEL: baz: +; MEDIUMDESC_SCH: # %bb.0: +; MEDIUMDESC_SCH-NEXT: addi.d $sp, $sp, -16 +; MEDIUMDESC_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; MEDIUMDESC_SCH-NEXT: pcalau12i $a0, %desc_pc_hi20(gd) +; MEDIUMDESC_SCH-NEXT: addi.d $a0, $a0, %desc_pc_lo12(gd) +; MEDIUMDESC_SCH-NEXT: ld.d $ra, $a0, %desc_ld(gd) +; MEDIUMDESC_SCH-NEXT: jirl $ra, $ra, %desc_call(gd) +; MEDIUMDESC_SCH-NEXT: add.d $a0, $a0, $tp +; MEDIUMDESC_SCH-NEXT: ld.d $zero, $a0, 0 +; MEDIUMDESC_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; MEDIUMDESC_SCH-NEXT: addi.d $sp, $sp, 16 +; 
MEDIUMDESC_SCH-NEXT: ret +; +; LARGEDESC_NO_SCH-LABEL: baz: +; LARGEDESC_NO_SCH: # %bb.0: +; LARGEDESC_NO_SCH-NEXT: addi.d $sp, $sp, -16 +; LARGEDESC_NO_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LARGEDESC_NO_SCH-NEXT: pcalau12i $a0, %desc_pc_hi20(gd) +; LARGEDESC_NO_SCH-NEXT: addi.d $a1, $zero, %desc_pc_lo12(gd) +; LARGEDESC_NO_SCH-NEXT: lu32i.d $a1, %desc64_pc_lo20(gd) +; LARGEDESC_NO_SCH-NEXT: lu52i.d $a1, $a1, %desc64_pc_hi12(gd) +; LARGEDESC_NO_SCH-NEXT: add.d $a0, $a0, $a1 +; LARGEDESC_NO_SCH-NEXT: ld.d $ra, $a0, %desc_ld(gd) +; LARGEDESC_NO_SCH-NEXT: jirl $ra, $ra, %desc_call(gd) +; LARGEDESC_NO_SCH-NEXT: add.d $a0, $a0, $tp +; LARGEDESC_NO_SCH-NEXT: ld.d $zero, $a0, 0 +; LARGEDESC_NO_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LARGEDESC_NO_SCH-NEXT: addi.d $sp, $sp, 16 +; LARGEDESC_NO_SCH-NEXT: ret +; +; LARGEDESC_SCH-LABEL: baz: +; LARGEDESC_SCH: # %bb.0: +; LARGEDESC_SCH-NEXT: addi.d $sp, $sp, -16 +; LARGEDESC_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LARGEDESC_SCH-NEXT: addi.d $a1, $zero, %desc_pc_lo12(gd) +; LARGEDESC_SCH-NEXT: pcalau12i $a0, %desc_pc_hi20(gd) +; LARGEDESC_SCH-NEXT: lu32i.d $a1, %desc64_pc_lo20(gd) +; LARGEDESC_SCH-NEXT: lu52i.d $a1, $a1, %desc64_pc_hi12(gd) +; LARGEDESC_SCH-NEXT: add.d $a0, $a0, $a1 +; LARGEDESC_SCH-NEXT: ld.d $ra, $a0, %desc_ld(gd) +; LARGEDESC_SCH-NEXT: jirl $ra, $ra, %desc_call(gd) +; LARGEDESC_SCH-NEXT: add.d $a0, $a0, $tp +; LARGEDESC_SCH-NEXT: ld.d $zero, $a0, 0 +; LARGEDESC_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LARGEDESC_SCH-NEXT: addi.d $sp, $sp, 16 +; LARGEDESC_SCH-NEXT: ret + %v_gd = load volatile i64, ptr @gd + ret void +} From 56e944bede9654127cc210506a6cccdd43cd96e7 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Thu, 2 Jan 2025 20:13:18 -0800 Subject: [PATCH 047/480] [NFC] add anonymous namespace to a couple classes (#121511) This ensures these classes are visible only to the appropriate translation unit and allows for more optimizations. 
--- llvm/lib/IR/SafepointIRVerifier.cpp | 2 ++ .../Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/llvm/lib/IR/SafepointIRVerifier.cpp b/llvm/lib/IR/SafepointIRVerifier.cpp index ed99d05975c23..d32852b796c20 100644 --- a/llvm/lib/IR/SafepointIRVerifier.cpp +++ b/llvm/lib/IR/SafepointIRVerifier.cpp @@ -289,6 +289,7 @@ static void PrintValueSet(raw_ostream &OS, IteratorTy Begin, IteratorTy End) { using AvailableValueSet = DenseSet; +namespace { /// State we compute and track per basic block. struct BasicBlockState { // Set of values available coming in, before the phi nodes @@ -305,6 +306,7 @@ struct BasicBlockState { // contribute to AvailableOut. bool Cleared = false; }; +} // namespace /// A given derived pointer can have multiple base pointers through phi/selects. /// This type indicates when the base pointer is exclusively constant diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 45ee2d472a11b..12ae6740e055e 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -181,6 +181,7 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) { /// the bit indexes (Mask) needed by a masked compare. If we're matching a chain /// of 'and' ops, then we also need to capture the fact that we saw an /// "and X, 1", so that's an extra return value for that case. 
+namespace { struct MaskOps { Value *Root = nullptr; APInt Mask; @@ -190,6 +191,7 @@ struct MaskOps { MaskOps(unsigned BitWidth, bool MatchAnds) : Mask(APInt::getZero(BitWidth)), MatchAndChain(MatchAnds) {} }; +} // namespace /// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a /// chain of 'and' or 'or' instructions looking for shift ops of a common source From b6c06d1a8d9b359e7319312a2a7654f0e7c6690c Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 3 Jan 2025 15:18:35 +1100 Subject: [PATCH 048/480] [ORC] Fix bug in source file name finding in DebuggerSupportPlugin. The debug section map was using MachO section names (with the "__" prefix), but DWARFContext expects section names with the object format prefix stripped off. This was preventing DWARFContext from accessing the debug_str section, resulting in bogus source name strings. --- .../Orc/Debugging/DebuggerSupportPlugin.cpp | 26 +- .../x86-64/MachO-check-dwarf-filename.s | 315 ++++++++++++++++++ 2 files changed, 333 insertions(+), 8 deletions(-) create mode 100644 llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp index c08e52e943c92..0d9a912e25606 100644 --- a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp @@ -148,7 +148,7 @@ class MachODebugObjectSynthesizer : public MachODebugObjectSynthesizerBase { DSec.BuilderSec->align = Log2_64(SR.getFirstBlock()->getAlignment()); StringRef SectionData(SR.getFirstBlock()->getContent().data(), SR.getFirstBlock()->getSize()); - DebugSectionMap[SecName] = + DebugSectionMap[SecName.drop_front(2)] = // drop "__" prefix. 
MemoryBuffer::getMemBuffer(SectionData, G.getName(), false); if (SecName == "__debug_line") DebugLineSectionData = SectionData; @@ -167,11 +167,10 @@ class MachODebugObjectSynthesizer : public MachODebugObjectSynthesizerBase { DebugLineSectionData, G.getEndianness() == llvm::endianness::little, G.getPointerSize()); uint64_t Offset = 0; - DWARFDebugLine::LineTable LineTable; + DWARFDebugLine::Prologue P; // Try to parse line data. Consume error on failure. - if (auto Err = LineTable.parse(DebugLineData, &Offset, *DWARFCtx, nullptr, - consumeError)) { + if (auto Err = P.parse(DebugLineData, &Offset, consumeError, *DWARFCtx)) { handleAllErrors(std::move(Err), [&](ErrorInfoBase &EIB) { LLVM_DEBUG({ dbgs() << "Cannot parse line table for \"" << G.getName() << "\": "; @@ -180,15 +179,26 @@ class MachODebugObjectSynthesizer : public MachODebugObjectSynthesizerBase { }); }); } else { - if (!LineTable.Prologue.FileNames.empty()) - FileName = *dwarf::toString(LineTable.Prologue.FileNames[0].Name); + for (auto &FN : P.FileNames) + if ((FileName = dwarf::toString(FN.Name))) { + LLVM_DEBUG({ + dbgs() << "Using FileName = \"" << *FileName + << "\" from DWARF line table\n"; + }); + break; + } } } // If no line table (or unable to use) then use graph name. // FIXME: There are probably other debug sections we should look in first. - if (!FileName) - FileName = StringRef(G.getName()); + if (!FileName) { + LLVM_DEBUG({ + dbgs() << "Could not find source name from DWARF line table. 
" + "Using FileName = \"\"\n"; + }); + FileName = ""; + } Builder.addSymbol("", MachO::N_SO, 0, 0, 0); Builder.addSymbol(*FileName, MachO::N_SO, 0, 0, 0); diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s new file mode 100644 index 0000000000000..058ef55fd1e3c --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s @@ -0,0 +1,315 @@ +# RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t.o %s +# RUN: llvm-jitlink -debug-only=orc -noexec -debugger-support %t.o 2>&1 | \ +# RUN: FileCheck %s +# +# Test that source file names can be identified from DWARF line tables. + +# CHECK: Using FileName = "check-dwarf-filename.c" from DWARF line table + + .section __TEXT,__text,regular,pure_instructions + .build_version macos, 15, 0 sdk_version 15, 0 + .globl _main ## -- Begin function main + .p2align 4, 0x90 +_main: ## @main +Lfunc_begin0: + .file 0 "/Users/lhames/Projects/scratch" "check-dwarf-filename.c" md5 0x331a6c7ae0cfcd2896eca60ac6f5703e + .loc 0 1 0 ## check-dwarf-filename.c:1:0 + .cfi_startproc +## %bb.0: + ##DEBUG_VALUE: main:argc <- $edi + ##DEBUG_VALUE: main:argv <- $rsi + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +Ltmp0: + .loc 0 2 3 prologue_end ## check-dwarf-filename.c:2:3 + xorl %eax, %eax + .loc 0 2 3 epilogue_begin is_stmt 0 ## check-dwarf-filename.c:2:3 + popq %rbp + retq +Ltmp1: +Lfunc_end0: + .cfi_endproc + ## -- End function + .section __DWARF,__debug_abbrev,regular,debug +Lsection_abbrev: + .byte 1 ## Abbreviation Code + .byte 17 ## DW_TAG_compile_unit + .byte 1 ## DW_CHILDREN_yes + .byte 37 ## DW_AT_producer + .byte 37 ## DW_FORM_strx1 + .byte 19 ## DW_AT_language + .byte 5 ## DW_FORM_data2 + .byte 3 ## DW_AT_name + .byte 37 ## DW_FORM_strx1 + .ascii "\202|" ## DW_AT_LLVM_sysroot + .byte 37 ## DW_FORM_strx1 + .ascii "\357\177" ## 
DW_AT_APPLE_sdk + .byte 37 ## DW_FORM_strx1 + .byte 114 ## DW_AT_str_offsets_base + .byte 23 ## DW_FORM_sec_offset + .byte 16 ## DW_AT_stmt_list + .byte 23 ## DW_FORM_sec_offset + .byte 27 ## DW_AT_comp_dir + .byte 37 ## DW_FORM_strx1 + .ascii "\341\177" ## DW_AT_APPLE_optimized + .byte 25 ## DW_FORM_flag_present + .byte 17 ## DW_AT_low_pc + .byte 27 ## DW_FORM_addrx + .byte 18 ## DW_AT_high_pc + .byte 6 ## DW_FORM_data4 + .byte 115 ## DW_AT_addr_base + .byte 23 ## DW_FORM_sec_offset + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 2 ## Abbreviation Code + .byte 46 ## DW_TAG_subprogram + .byte 1 ## DW_CHILDREN_yes + .byte 17 ## DW_AT_low_pc + .byte 27 ## DW_FORM_addrx + .byte 18 ## DW_AT_high_pc + .byte 6 ## DW_FORM_data4 + .byte 64 ## DW_AT_frame_base + .byte 24 ## DW_FORM_exprloc + .byte 122 ## DW_AT_call_all_calls + .byte 25 ## DW_FORM_flag_present + .byte 3 ## DW_AT_name + .byte 37 ## DW_FORM_strx1 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 39 ## DW_AT_prototyped + .byte 25 ## DW_FORM_flag_present + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 63 ## DW_AT_external + .byte 25 ## DW_FORM_flag_present + .ascii "\341\177" ## DW_AT_APPLE_optimized + .byte 25 ## DW_FORM_flag_present + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 3 ## Abbreviation Code + .byte 5 ## DW_TAG_formal_parameter + .byte 0 ## DW_CHILDREN_no + .byte 2 ## DW_AT_location + .byte 24 ## DW_FORM_exprloc + .byte 3 ## DW_AT_name + .byte 37 ## DW_FORM_strx1 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 4 ## Abbreviation Code + .byte 36 ## DW_TAG_base_type + .byte 0 ## DW_CHILDREN_no + .byte 3 ## DW_AT_name + .byte 37 ## DW_FORM_strx1 + .byte 62 ## DW_AT_encoding + .byte 11 ## DW_FORM_data1 + .byte 11 ## DW_AT_byte_size + .byte 11 ## 
DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 5 ## Abbreviation Code + .byte 15 ## DW_TAG_pointer_type + .byte 0 ## DW_CHILDREN_no + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 0 ## EOM(3) + .section __DWARF,__debug_info,regular,debug +Lsection_info: +Lcu_begin0: +.set Lset0, Ldebug_info_end0-Ldebug_info_start0 ## Length of Unit + .long Lset0 +Ldebug_info_start0: + .short 5 ## DWARF version number + .byte 1 ## DWARF Unit Type + .byte 8 ## Address Size (in bytes) +.set Lset1, Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. Section + .long Lset1 + .byte 1 ## Abbrev [1] 0xc:0x50 DW_TAG_compile_unit + .byte 0 ## DW_AT_producer + .short 29 ## DW_AT_language + .byte 1 ## DW_AT_name + .byte 2 ## DW_AT_LLVM_sysroot + .byte 3 ## DW_AT_APPLE_sdk +.set Lset2, Lstr_offsets_base0-Lsection_str_off ## DW_AT_str_offsets_base + .long Lset2 +.set Lset3, Lline_table_start0-Lsection_line ## DW_AT_stmt_list + .long Lset3 + .byte 4 ## DW_AT_comp_dir + ## DW_AT_APPLE_optimized + .byte 0 ## DW_AT_low_pc +.set Lset4, Lfunc_end0-Lfunc_begin0 ## DW_AT_high_pc + .long Lset4 +.set Lset5, Laddr_table_base0-Lsection_info0 ## DW_AT_addr_base + .long Lset5 + .byte 2 ## Abbrev [2] 0x25:0x24 DW_TAG_subprogram + .byte 0 ## DW_AT_low_pc +.set Lset6, Lfunc_end0-Lfunc_begin0 ## DW_AT_high_pc + .long Lset6 + .byte 1 ## DW_AT_frame_base + .byte 86 + ## DW_AT_call_all_calls + .byte 5 ## DW_AT_name + .byte 0 ## DW_AT_decl_file + .byte 1 ## DW_AT_decl_line + ## DW_AT_prototyped + .long 73 ## DW_AT_type + ## DW_AT_external + ## DW_AT_APPLE_optimized + .byte 3 ## Abbrev [3] 0x34:0xa DW_TAG_formal_parameter + .byte 1 ## DW_AT_location + .byte 85 + .byte 7 ## DW_AT_name + .byte 0 ## DW_AT_decl_file + .byte 1 ## DW_AT_decl_line + .long 73 ## DW_AT_type + .byte 3 ## Abbrev [3] 0x3e:0xa DW_TAG_formal_parameter + .byte 1 ## DW_AT_location + .byte 84 + .byte 8 ## DW_AT_name + .byte 0 ## DW_AT_decl_file + .byte 1 ## DW_AT_decl_line + .long 77 
## DW_AT_type + .byte 0 ## End Of Children Mark + .byte 4 ## Abbrev [4] 0x49:0x4 DW_TAG_base_type + .byte 6 ## DW_AT_name + .byte 5 ## DW_AT_encoding + .byte 4 ## DW_AT_byte_size + .byte 5 ## Abbrev [5] 0x4d:0x5 DW_TAG_pointer_type + .long 82 ## DW_AT_type + .byte 5 ## Abbrev [5] 0x52:0x5 DW_TAG_pointer_type + .long 87 ## DW_AT_type + .byte 4 ## Abbrev [4] 0x57:0x4 DW_TAG_base_type + .byte 9 ## DW_AT_name + .byte 6 ## DW_AT_encoding + .byte 1 ## DW_AT_byte_size + .byte 0 ## End Of Children Mark +Ldebug_info_end0: + .section __DWARF,__debug_str_offs,regular,debug +Lsection_str_off: + .long 44 ## Length of String Offsets Set + .short 5 + .short 0 +Lstr_offsets_base0: + .section __DWARF,__debug_str,regular,debug +Linfo_string: + .asciz "Apple clang version 16.0.0 (clang-1600.0.26.3)" ## string offset=0 + .asciz "check-dwarf-filename.c" ## string offset=47 + .asciz "/Library/Developer/CommandLineTools/SDKs/MacOSX15.0.sdk" ## string offset=70 + .asciz "MacOSX15.0.sdk" ## string offset=126 + .asciz "/Users/lhames/Projects/scratch" ## string offset=141 + .asciz "main" ## string offset=172 + .asciz "int" ## string offset=177 + .asciz "argc" ## string offset=181 + .asciz "argv" ## string offset=186 + .asciz "char" ## string offset=191 + .section __DWARF,__debug_str_offs,regular,debug + .long 0 + .long 47 + .long 70 + .long 126 + .long 141 + .long 172 + .long 177 + .long 181 + .long 186 + .long 191 + .section __DWARF,__debug_addr,regular,debug +Lsection_info0: +.set Lset7, Ldebug_addr_end0-Ldebug_addr_start0 ## Length of contribution + .long Lset7 +Ldebug_addr_start0: + .short 5 ## DWARF version number + .byte 8 ## Address size + .byte 0 ## Segment selector size +Laddr_table_base0: + .quad Lfunc_begin0 +Ldebug_addr_end0: + .section __DWARF,__debug_names,regular,debug +Ldebug_names_begin: +.set Lset8, Lnames_end0-Lnames_start0 ## Header: unit length + .long Lset8 +Lnames_start0: + .short 5 ## Header: version + .short 0 ## Header: padding + .long 1 ## Header: compilation unit 
count + .long 0 ## Header: local type unit count + .long 0 ## Header: foreign type unit count + .long 3 ## Header: bucket count + .long 3 ## Header: name count +.set Lset9, Lnames_abbrev_end0-Lnames_abbrev_start0 ## Header: abbreviation table size + .long Lset9 + .long 8 ## Header: augmentation string size + .ascii "LLVM0700" ## Header: augmentation string +.set Lset10, Lcu_begin0-Lsection_info ## Compilation unit 0 + .long Lset10 + .long 0 ## Bucket 0 + .long 1 ## Bucket 1 + .long 2 ## Bucket 2 + .long 2090499946 ## Hash in Bucket 1 + .long 193495088 ## Hash in Bucket 2 + .long 2090147939 ## Hash in Bucket 2 + .long 172 ## String in Bucket 1: main + .long 177 ## String in Bucket 2: int + .long 191 ## String in Bucket 2: char +.set Lset11, Lnames0-Lnames_entries0 ## Offset in Bucket 1 + .long Lset11 +.set Lset12, Lnames1-Lnames_entries0 ## Offset in Bucket 2 + .long Lset12 +.set Lset13, Lnames2-Lnames_entries0 ## Offset in Bucket 2 + .long Lset13 +Lnames_abbrev_start0: + .ascii "\230." ## Abbrev code + .byte 46 ## DW_TAG_subprogram + .byte 3 ## DW_IDX_die_offset + .byte 19 ## DW_FORM_ref4 + .byte 4 ## DW_IDX_parent + .byte 25 ## DW_FORM_flag_present + .byte 0 ## End of abbrev + .byte 0 ## End of abbrev + .ascii "\230$" ## Abbrev code + .byte 36 ## DW_TAG_base_type + .byte 3 ## DW_IDX_die_offset + .byte 19 ## DW_FORM_ref4 + .byte 4 ## DW_IDX_parent + .byte 25 ## DW_FORM_flag_present + .byte 0 ## End of abbrev + .byte 0 ## End of abbrev + .byte 0 ## End of abbrev list +Lnames_abbrev_end0: +Lnames_entries0: +Lnames0: +L1: + .ascii "\230." 
## Abbreviation code + .long 37 ## DW_IDX_die_offset + .byte 0 ## DW_IDX_parent + ## End of list: main +Lnames1: +L0: + .ascii "\230$" ## Abbreviation code + .long 73 ## DW_IDX_die_offset + .byte 0 ## DW_IDX_parent + ## End of list: int +Lnames2: +L2: + .ascii "\230$" ## Abbreviation code + .long 87 ## DW_IDX_die_offset + .byte 0 ## DW_IDX_parent + ## End of list: char + .p2align 2, 0x0 +Lnames_end0: +.subsections_via_symbols + .section __DWARF,__debug_line,regular,debug +Lsection_line: +Lline_table_start0: From 1c997feff16860ab6b21c5c03dc7ca65f300967f Mon Sep 17 00:00:00 2001 From: dmasloff <74042473+dmasloff@users.noreply.github.com> Date: Fri, 3 Jan 2025 08:52:01 +0300 Subject: [PATCH 049/480] [clang-format] Add option WrapNamespaceBodyWithNewlines (#106145) It wraps the body of namespace with additional newlines, turning this code: ``` namespace N { int function(); } ``` into the following: ``` namespace N { int function(); } ``` --------- Co-authored-by: Owen Pan --- clang/docs/ClangFormatStyleOptions.rst | 39 ++++++ clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/Format/Format.h | 36 +++++- clang/lib/Format/Format.cpp | 15 +++ clang/lib/Format/UnwrappedLineFormatter.cpp | 17 +++ clang/unittests/Format/ConfigParseTest.cpp | 7 ++ clang/unittests/Format/FormatTest.cpp | 130 ++++++++++++++++++++ 7 files changed, 244 insertions(+), 1 deletion(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index d9b3f666df03c..7bfaee4e2d35b 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -6843,6 +6843,45 @@ the configuration (without a prefix: ``Auto``). For example: BOOST_PP_STRINGIZE +.. _WrapNamespaceBodyWithEmptyLines: + +**WrapNamespaceBodyWithEmptyLines** (``WrapNamespaceBodyWithEmptyLinesStyle``) :versionbadge:`clang-format 20` :ref:`¶ ` + Wrap namespace body with empty lines. 
+ + Possible values: + + * ``WNBWELS_Never`` (in configuration: ``Never``) + Remove all empty lines at the beginning and the end of namespace body. + + .. code-block:: c++ + + namespace N1 { + namespace N2 { + function(); + } + } + + * ``WNBWELS_Always`` (in configuration: ``Always``) + Always have at least one empty line at the beginning and the end of + namespace body except that the number of empty lines between consecutive + nested namespace definitions is not increased. + + .. code-block:: c++ + + namespace N1 { + namespace N2 { + + function(); + + } + } + + * ``WNBWELS_Leave`` (in configuration: ``Leave``) + Keep existing newlines at the beginning and the end of namespace body. + ``MaxEmptyLinesToKeep`` still applies. + + + .. END_FORMAT_STYLE_OPTIONS Adding additional style options diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index aca07e2ba9cf2..2789a24ebf273 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1127,6 +1127,7 @@ clang-format - Adds ``AllowShortNamespacesOnASingleLine`` option. - Adds ``VariableTemplates`` option. - Adds support for bash globstar in ``.clang-format-ignore``. +- Adds ``WrapNamespaceBodyWithEmptyLines`` option. libclang -------- diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index bb34f2d33ac15..9b7a633e0a146 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -5143,6 +5143,39 @@ struct FormatStyle { /// \version 11 std::vector WhitespaceSensitiveMacros; + /// Different styles for wrapping namespace body with empty lines. + enum WrapNamespaceBodyWithEmptyLinesStyle : int8_t { + /// Remove all empty lines at the beginning and the end of namespace body. 
+ /// \code + /// namespace N1 { + /// namespace N2 { + /// function(); + /// } + /// } + /// \endcode + WNBWELS_Never, + /// Always have at least one empty line at the beginning and the end of + /// namespace body except that the number of empty lines between consecutive + /// nested namespace definitions is not increased. + /// \code + /// namespace N1 { + /// namespace N2 { + /// + /// function(); + /// + /// } + /// } + /// \endcode + WNBWELS_Always, + /// Keep existing newlines at the beginning and the end of namespace body. + /// ``MaxEmptyLinesToKeep`` still applies. + WNBWELS_Leave + }; + + /// Wrap namespace body with empty lines. + /// \version 20 + WrapNamespaceBodyWithEmptyLinesStyle WrapNamespaceBodyWithEmptyLines; + bool operator==(const FormatStyle &R) const { return AccessModifierOffset == R.AccessModifierOffset && AlignAfterOpenBracket == R.AlignAfterOpenBracket && @@ -5326,7 +5359,8 @@ struct FormatStyle { UseTab == R.UseTab && VariableTemplates == R.VariableTemplates && VerilogBreakBetweenInstancePorts == R.VerilogBreakBetweenInstancePorts && - WhitespaceSensitiveMacros == R.WhitespaceSensitiveMacros; + WhitespaceSensitiveMacros == R.WhitespaceSensitiveMacros && + WrapNamespaceBodyWithEmptyLines == R.WrapNamespaceBodyWithEmptyLines; } std::optional GetLanguageStyle(LanguageKind Language) const; diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index a5657f2d910f6..e51d7ac2e5b6c 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -839,6 +839,18 @@ template <> struct ScalarEnumerationTraits { } }; +template <> +struct ScalarEnumerationTraits< + FormatStyle::WrapNamespaceBodyWithEmptyLinesStyle> { + static void + enumeration(IO &IO, + FormatStyle::WrapNamespaceBodyWithEmptyLinesStyle &Value) { + IO.enumCase(Value, "Never", FormatStyle::WNBWELS_Never); + IO.enumCase(Value, "Always", FormatStyle::WNBWELS_Always); + IO.enumCase(Value, "Leave", FormatStyle::WNBWELS_Leave); + } +}; + template <> struct 
MappingTraits { static void mapping(IO &IO, FormatStyle &Style) { // When reading, read the language first, we need it for getPredefinedStyle. @@ -1171,6 +1183,8 @@ template <> struct MappingTraits { Style.VerilogBreakBetweenInstancePorts); IO.mapOptional("WhitespaceSensitiveMacros", Style.WhitespaceSensitiveMacros); + IO.mapOptional("WrapNamespaceBodyWithEmptyLines", + Style.WrapNamespaceBodyWithEmptyLines); // If AlwaysBreakAfterDefinitionReturnType was specified but // BreakAfterReturnType was not, initialize the latter from the former for @@ -1639,6 +1653,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.WhitespaceSensitiveMacros.push_back("NS_SWIFT_NAME"); LLVMStyle.WhitespaceSensitiveMacros.push_back("PP_STRINGIZE"); LLVMStyle.WhitespaceSensitiveMacros.push_back("STRINGIZE"); + LLVMStyle.WrapNamespaceBodyWithEmptyLines = FormatStyle::WNBWELS_Leave; LLVMStyle.PenaltyBreakAssignment = prec::Assignment; LLVMStyle.PenaltyBreakBeforeFirstCallParameter = 19; diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index 803c600cec44d..bc6766a47f5c7 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -1584,6 +1584,23 @@ static auto computeNewlines(const AnnotatedLine &Line, Newlines = 1; } + if (Style.WrapNamespaceBodyWithEmptyLines != FormatStyle::WNBWELS_Leave) { + // Modify empty lines after TT_NamespaceLBrace. + if (PreviousLine && PreviousLine->endsWith(TT_NamespaceLBrace)) { + if (Style.WrapNamespaceBodyWithEmptyLines == FormatStyle::WNBWELS_Never) + Newlines = 1; + else if (!Line.startsWithNamespace()) + Newlines = std::max(Newlines, 2u); + } + // Modify empty lines before TT_NamespaceRBrace. 
+ if (Line.startsWith(TT_NamespaceRBrace)) { + if (Style.WrapNamespaceBodyWithEmptyLines == FormatStyle::WNBWELS_Never) + Newlines = 1; + else if (!PreviousLine->startsWith(TT_NamespaceRBrace)) + Newlines = std::max(Newlines, 2u); + } + } + // Insert or remove empty line before access specifiers. if (PreviousLine && RootToken.isAccessSpecifier()) { switch (Style.EmptyLineBeforeAccessModifier) { diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index b249bf073aa45..9c38dbbc51f0a 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -865,6 +865,13 @@ TEST(ConfigParseTest, ParsesConfiguration) { CHECK_PARSE("SortUsingDeclarations: true", SortUsingDeclarations, FormatStyle::SUD_LexicographicNumeric); + CHECK_PARSE("WrapNamespaceBodyWithEmptyLines: Never", + WrapNamespaceBodyWithEmptyLines, FormatStyle::WNBWELS_Never); + CHECK_PARSE("WrapNamespaceBodyWithEmptyLines: Always", + WrapNamespaceBodyWithEmptyLines, FormatStyle::WNBWELS_Always); + CHECK_PARSE("WrapNamespaceBodyWithEmptyLines: Leave", + WrapNamespaceBodyWithEmptyLines, FormatStyle::WNBWELS_Leave); + // FIXME: This is required because parsing a configuration simply overwrites // the first N elements of the list instead of resetting it. Style.ForEachMacros.clear(); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 22b6f7e1b62e2..44b9dba249890 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -28427,6 +28427,136 @@ TEST_F(FormatTest, ShortNamespacesOption) { Style); } +TEST_F(FormatTest, WrapNamespaceBodyWithEmptyLinesNever) { + auto Style = getLLVMStyle(); + Style.FixNamespaceComments = false; + Style.MaxEmptyLinesToKeep = 2; + Style.WrapNamespaceBodyWithEmptyLines = FormatStyle::WNBWELS_Never; + + // Empty namespace. + verifyFormat("namespace N {}", Style); + + // Single namespace. 
+ verifyFormat("namespace N {\n" + "int f1(int a) { return 2 * a; }\n" + "}", + "namespace N {\n" + "\n" + "\n" + "int f1(int a) { return 2 * a; }\n" + "\n" + "\n" + "}", + Style); + + // Nested namespace. + verifyFormat("namespace N1 {\n" + "namespace N2 {\n" + "int a = 1;\n" + "}\n" + "}", + "namespace N1 {\n" + "\n" + "\n" + "namespace N2 {\n" + "\n" + "int a = 1;\n" + "\n" + "}\n" + "\n" + "\n" + "}", + Style); + + Style.CompactNamespaces = true; + + verifyFormat("namespace N1 { namespace N2 {\n" + "int a = 1;\n" + "}}", + "namespace N1 { namespace N2 {\n" + "\n" + "\n" + "int a = 1;\n" + "\n" + "\n" + "}}", + Style); +} + +TEST_F(FormatTest, WrapNamespaceBodyWithEmptyLinesAlways) { + auto Style = getLLVMStyle(); + Style.FixNamespaceComments = false; + Style.MaxEmptyLinesToKeep = 2; + Style.WrapNamespaceBodyWithEmptyLines = FormatStyle::WNBWELS_Always; + + // Empty namespace. + verifyFormat("namespace N {}", Style); + + // Single namespace. + verifyFormat("namespace N {\n" + "\n" + "int f1(int a) { return 2 * a; }\n" + "\n" + "}", + "namespace N {\n" + "int f1(int a) { return 2 * a; }\n" + "}", + Style); + + // Nested namespace. 
+ verifyFormat("namespace N1 {\n" + "namespace N2 {\n" + "\n" + "int a = 1;\n" + "\n" + "}\n" + "}", + "namespace N1 {\n" + "namespace N2 {\n" + "int a = 1;\n" + "}\n" + "}", + Style); + + verifyFormat("namespace N1 {\n" + "\n" + "namespace N2 {\n" + "\n" + "\n" + "int a = 1;\n" + "\n" + "\n" + "}\n" + "\n" + "}", + "namespace N1 {\n" + "\n" + "namespace N2 {\n" + "\n" + "\n" + "\n" + "int a = 1;\n" + "\n" + "\n" + "\n" + "}\n" + "\n" + "}", + Style); + + Style.CompactNamespaces = true; + + verifyFormat("namespace N1 { namespace N2 {\n" + "\n" + "int a = 1;\n" + "\n" + "}}", + "namespace N1 { namespace N2 {\n" + "int a = 1;\n" + "}}", + Style); +} + } // namespace } // namespace test } // namespace format From 72db3f989e499c8c5d585d3624cd563600cd2396 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Fri, 3 Jan 2025 14:05:02 +0800 Subject: [PATCH 050/480] [RISCV] Allow tail memcmp expansion (#121460) This optimization was introduced by #70469. Like AArch64, we allow tail expansions for 3 on RV32 and 3/5/6 on RV64. This can simplify the comparison and reduce the number of blocks. 
--- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 7 +- llvm/test/CodeGen/RISCV/memcmp-optsize.ll | 234 ++++++++---------- llvm/test/CodeGen/RISCV/memcmp.ll | 234 ++++++++---------- 3 files changed, 201 insertions(+), 274 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 909a64e974255..850d6244affa5 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -2567,9 +2567,12 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { Options.AllowOverlappingLoads = true; Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); Options.NumLoadsPerBlock = Options.MaxNumLoads; - if (ST->is64Bit()) + if (ST->is64Bit()) { Options.LoadSizes = {8, 4, 2, 1}; - else + Options.AllowedTailExpansions = {3, 5, 6}; + } else { Options.LoadSizes = {4, 2, 1}; + Options.AllowedTailExpansions = {3}; + } return Options; } diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll index d529ae6ecd0ab..b9a27b9d0c9e7 100644 --- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll +++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll @@ -2449,82 +2449,72 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV32-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a2, 0(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a3, 0(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a2, a2, 16 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a3, a3, 16 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB24_2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a0, 2(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a1, 2(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB24_2: # %res_block -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a2, 2(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lhu a0, 0(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a3, 2(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lhu a1, 0(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: slli a2, a2, 16 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: slli a3, a3, 16 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 48 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 48 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB24_2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a0, 2(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a1, 2(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB24_2: # %res_block -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a2, 2(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a3, 2(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, a2, 16 +; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 16 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV32-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a2, 0(a0) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a3, 0(a1) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a2, a2, 16 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a3, a3, 16 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB24_2 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lhu a2, 0(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a0, 2(a0) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lhu a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a1, 2(a1) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB24_2: # %res_block -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 48 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 48 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB24_2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 2(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 2(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB24_2: # %res_block -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a2, 2(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a3, 2(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slli a2, a2, 16 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slli a3, a3, 16 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_3: @@ -2845,22 +2835,19 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_5: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB26_2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a1, 4(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB26_2: # %res_block -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a2, 4(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a3, 4(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, a2, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_5: @@ -2883,22 +2870,17 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_5: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB26_2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 4(a0) +; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 4(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB26_2: # %res_block -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_5: @@ -3052,28 +3034,19 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_6: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB27_3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a1, 4(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 48 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 48 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB27_3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: -; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB27_3: # %res_block -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 -; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a2, 4(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a3, 4(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, a2, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_6: @@ -3102,28 +3075,17 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_6: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB27_3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a1, 4(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 48 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 48 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB27_3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB27_3: # %res_block -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 -; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 4(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 4(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_6: diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll index 860c3a94abc0a..629a9298ee469 100644 --- a/llvm/test/CodeGen/RISCV/memcmp.ll +++ b/llvm/test/CodeGen/RISCV/memcmp.ll @@ -3145,82 +3145,72 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV32-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a2, 0(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a3, 0(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a2, a2, 16 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a3, a3, 16 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB24_2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a0, 2(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a1, 2(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB24_2: # %res_block -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a2, 2(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lhu a0, 0(a0) +; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a3, 2(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lhu a1, 0(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: slli a2, a2, 16 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: slli a3, a3, 16 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 48 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 48 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB24_2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a0, 2(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a1, 2(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB24_2: # %res_block -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a2, 2(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a3, 2(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, a2, 16 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 16 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 +; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV32-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a2, 0(a0) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a3, 0(a1) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a2, a2, 16 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a3, a3, 16 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB24_2 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lhu a2, 0(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a0, 2(a0) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lhu a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a1, 2(a1) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB24_2: # %res_block -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 48 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 48 
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB24_2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 2(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 2(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB24_2: # %res_block -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a2, 2(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a3, 2(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slli a2, a2, 16 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slli a3, a3, 16 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_3: @@ -3541,22 +3531,19 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_5: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB26_2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a1, 4(a1) 
-; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB26_2: # %res_block -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a2, 4(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a3, 4(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, a2, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_5: @@ -3579,22 +3566,17 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_5: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB26_2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 4(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 4(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB26_2: # %res_block -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 -; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_5: @@ -3748,28 +3730,19 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_6: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB27_3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a1, 4(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 48 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 48 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB27_3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: -; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB27_3: # %res_block -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a2, 4(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a3, 4(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, 
a2, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_6: @@ -3798,28 +3771,17 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_6: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB27_3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a1, 4(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 48 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 48 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB27_3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB27_3: # %res_block -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 4(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 4(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0 +; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_6: From 93a68a5188b6aa940f51d8ce0317299409f828ae Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 3 Jan 2025 17:29:43 +1100 Subject: [PATCH 051/480] [ORC] Testcase requires asserts as it depends on debugging output. --- .../ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s index 058ef55fd1e3c..a2eee21a0761d 100644 --- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s @@ -2,6 +2,8 @@ # RUN: llvm-jitlink -debug-only=orc -noexec -debugger-support %t.o 2>&1 | \ # RUN: FileCheck %s # +# REQUIRES: asserts +# # Test that source file names can be indentified from DWARF line tables. # CHECK: Using FileName = "check-dwarf-filename.c" from DWARF line table From febe1a9d286df495ca342011b3134823eee37557 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 3 Jan 2025 15:47:43 +1100 Subject: [PATCH 052/480] [ORC] Use structured binding to improve readability. NFC. 
--- llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h index 6ffd286c365ac..8e29f219774b3 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h @@ -460,8 +460,8 @@ template class MachOBuilder { return; StrTab.resize(Strings.size()); - for (auto &KV : Strings) - StrTab[KV.second] = {KV.first, 0}; + for (auto &[Str, Idx] : Strings) + StrTab[Idx] = {Str, 0}; size_t Offset = 0; for (auto &Elem : StrTab) { Elem.Offset = Offset; From 30b73ed7bd8934c32e4bd5430bccf52a226deabd Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 3 Jan 2025 17:15:33 +1100 Subject: [PATCH 053/480] [ORC][MachO] Avoid another race condition in MachOPlatform bootstrap. Similar to a9e75b1d4d1: During MachOPlatform bootstrap we need to defer actions until essential platform functionality has been loaded, but the platform itself may be loaded under a concurrent dispatcher so we have to guard against the deferred actions vector being accessed concurrently. This fixes a probabilistic failure in the ORC runtime regression tests on Darwin/x86-64 that was spotted after edca1d9bad2 (which turned on concurrent linking by default in llvm-jitlink).
--- llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 0e8349711e6fe..9f324c7048c63 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -937,6 +937,12 @@ Error MachOPlatform::MachOPlatformPlugin::bootstrapPipelineEnd( jitlink::LinkGraph &G) { std::lock_guard Lock(MP.Bootstrap.load()->Mutex); assert(MP.Bootstrap && "DeferredAAs reset before bootstrap completed"); + + // Transfer any allocation actions to DeferredAAs. + std::move(G.allocActions().begin(), G.allocActions().end(), + std::back_inserter(MP.Bootstrap.load()->DeferredAAs)); + G.allocActions().clear(); + --MP.Bootstrap.load()->ActiveGraphs; // Notify Bootstrap->CV while holding the mutex because the mutex is // also keeping Bootstrap->CV alive. @@ -1397,10 +1403,6 @@ Error MachOPlatform::MachOPlatformPlugin::registerObjectPlatformSections( SPSExecutorAddrRange, SPSExecutorAddrRange>>, SPSSequence>>; - shared::AllocActions &allocActions = LLVM_LIKELY(!InBootstrapPhase) - ? G.allocActions() - : MP.Bootstrap.load()->DeferredAAs; - ExecutorAddr HeaderAddr; { std::lock_guard Lock(MP.PlatformMutex); @@ -1410,7 +1412,7 @@ Error MachOPlatform::MachOPlatformPlugin::registerObjectPlatformSections( assert(I->second && "Null header registered for JD"); HeaderAddr = I->second; } - allocActions.push_back( + G.allocActions().push_back( {cantFail( WrapperFunctionCall::Create( MP.RegisterObjectPlatformSections.Addr, HeaderAddr, UnwindInfo, From 82fecab85ae2d72ffac0e44749d99f12d6f71cc0 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 2 Jan 2025 23:01:28 -0800 Subject: [PATCH 054/480] [gcov] Bump default version to 11.1 The gcov version is set to 11.1 (compatible with gcov 9) even if `-Xclang -coverage-version=` specified version is less than 11.1. 
Therefore, we can drop producer support for version < 11.1. --- clang/include/clang/Basic/CodeGenOptions.h | 2 +- clang/lib/Basic/CodeGenOptions.cpp | 2 - clang/lib/Frontend/CompilerInvocation.cpp | 3 +- clang/test/CodeGen/code-coverage.c | 26 +++----- .../Inputs/instrprof-gcov-exceptions.cpp.gcov | 1 - ...rprof-gcov-multiple-bbs-single-line.c.gcov | 1 - .../instrprof-gcov-one-line-function.c.gcov | 1 - .../Inputs/instrprof-gcov-switch1.c.gcov | 1 - .../Inputs/instrprof-gcov-switch2.c.gcov | 1 - .../instrprof-shared-lib_in-loop.c.gcov | 1 - .../Inputs/instrprof-shared-main.c.gcov | 1 - .../profile/gcov-__gcov_flush-terminate.c | 1 - .../Instrumentation/GCOVProfiling.cpp | 64 +++++++------------ .../Transforms/GCOVProfiling/exit-block.ll | 2 +- llvm/test/Transforms/GCOVProfiling/version.ll | 8 +-- 15 files changed, 41 insertions(+), 74 deletions(-) diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index 8097c9ef772bc..c555fb3b72d64 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -186,7 +186,7 @@ class CodeGenOptions : public CodeGenOptionsBase { std::string ProfileExcludeFiles; /// The version string to put into coverage files. - char CoverageVersion[4]; + char CoverageVersion[4] = {'0', '0', '0', '0'}; /// Enable additional debugging information. 
std::string DebugPass; diff --git a/clang/lib/Basic/CodeGenOptions.cpp b/clang/lib/Basic/CodeGenOptions.cpp index 79d715305ef20..95e65ba9266f5 100644 --- a/clang/lib/Basic/CodeGenOptions.cpp +++ b/clang/lib/Basic/CodeGenOptions.cpp @@ -17,7 +17,6 @@ CodeGenOptions::CodeGenOptions() { #include "clang/Basic/CodeGenOptions.def" RelocationModel = llvm::Reloc::PIC_; - memcpy(CoverageVersion, "408*", 4); } void CodeGenOptions::resetNonModularOptions(StringRef ModuleFormat) { @@ -54,7 +53,6 @@ void CodeGenOptions::resetNonModularOptions(StringRef ModuleFormat) { } RelocationModel = llvm::Reloc::PIC_; - memcpy(CoverageVersion, "408*", 4); } } // end namespace clang diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 0ae6dce5dd40a..36dc45bde11ab 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1691,7 +1691,7 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts, } } - if (memcmp(Opts.CoverageVersion, "408*", 4) != 0) + if (memcmp(Opts.CoverageVersion, "0000", 4)) GenerateArg(Consumer, OPT_coverage_version_EQ, StringRef(Opts.CoverageVersion, 4)); @@ -2007,7 +2007,6 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, } else if (Args.hasArg(OPT_fmemory_profile)) Opts.MemoryProfileOutput = MemProfileBasename; - memcpy(Opts.CoverageVersion, "408*", 4); if (Opts.CoverageNotesFile.size() || Opts.CoverageDataFile.size()) { if (Args.hasArg(OPT_coverage_version_EQ)) { StringRef CoverageVersion = Args.getLastArgValue(OPT_coverage_version_EQ); diff --git a/clang/test/CodeGen/code-coverage.c b/clang/test/CodeGen/code-coverage.c index 4e3364df21785..5fa62360c9b56 100644 --- a/clang/test/CodeGen/code-coverage.c +++ b/clang/test/CodeGen/code-coverage.c @@ -3,18 +3,14 @@ /// 4.7 enables cfg_checksum. /// 4.8 (default, compatible with gcov 7) emits the exit block the second. 
// RUN: rm -rf %t && mkdir %t && cd %t -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='304*' %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,304 %s -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='407*' %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,407 %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='B21*' %s -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,1210 %s // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,408 %s -// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='304*' %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,304 %s -// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='407*' %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,407 %s +// RUN: FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,1110 %s +// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='B21*' %s -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,1210 %s // RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,408 %s +// RUN: FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,1110 %s // RUN: %clang_cc1 -emit-llvm -disable-red-zone -coverage-notes-file=aaa.gcno -coverage-data-file=bbb.gcda -debug-info-kind=limited -dwarf-version=4 %s -o - | FileCheck %s --check-prefix GCOV_FILE_INFO @@ -48,12 
+44,10 @@ int test2(int b) { // CHECK-SAME: [%emit_function_args_ty { i32 0, i32 {{[-0-9]+}}, i32 {{[-0-9]+}} }, %emit_function_args_ty { i32 1, i32 {{[-0-9]+}}, i32 {{[-0-9]+}} }] // CHECK: @__llvm_internal_gcov_emit_file_info = internal unnamed_addr constant [1 x %file_info] -/// 0x3330342a '3' '0' '4' '*' -// 304-SAME: i32 858797098 -/// 0x3430372a '4' '0' '7' '*' -// 407-SAME: i32 875575082 -/// 0x3430382a '4' '0' '8' '*' -// 408-SAME: i32 875575338 +/// 0x4231312a 'B' '1' '1' '*' +// 1110-SAME: i32 1110520106 +/// 0x4232312a 'B' '2' '1' '*' +// 1210-SAME: i32 1110585642 // Check for gcov initialization function pointers. // CHECK-RT-INIT: @__llvm_covinit_functions = private constant { ptr, ptr } { ptr @__llvm_gcov_writeout, ptr @__llvm_gcov_reset }, section "__llvm_covinit" diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-exceptions.cpp.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-exceptions.cpp.gcov index aa202763fd564..233fd142444a5 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-gcov-exceptions.cpp.gcov +++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-exceptions.cpp.gcov @@ -2,7 +2,6 @@ // CHECK-NEXT: -: 0:Graph:instrprof-gcov-exceptions.gcno // CHECK-NEXT: -: 0:Data:instrprof-gcov-exceptions.gcda // CHECK-NEXT: -: 0:Runs:1 -// CHECK-NEXT: -: 0:Programs:1 // CHECK-NEXT: -: 1:#include // CHECK-NEXT: -: 2: // CHECK-NEXT: 1: 3:void asd(std::string i) { diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov index 9297073d21ef8..a25632d475b34 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov +++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov @@ -2,7 +2,6 @@ // CHECK-NEXT: -: 0:Graph:instrprof-gcov-multiple-bbs-single-line.gcno // CHECK-NEXT: -: 0:Data:instrprof-gcov-multiple-bbs-single-line.gcda // CHECK-NEXT: -: 0:Runs:1 -// CHECK-NEXT: -: 
0:Programs:1 // CHECK-NEXT:function main called 1 returned 100% blocks executed 77% // CHECK-NEXT: 1: 1:int main(void) // CHECK-NEXT: -: 2:{ diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-one-line-function.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-one-line-function.c.gcov index 5a570a04742df..4dc68177e0b75 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-gcov-one-line-function.c.gcov +++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-one-line-function.c.gcov @@ -2,7 +2,6 @@ // CHECK-NEXT: -: 0:Graph:instrprof-gcov-one-line-function.gcno // CHECK-NEXT: -: 0:Data:instrprof-gcov-one-line-function.gcda // CHECK-NEXT: -: 0:Runs:1 -// CHECK-NEXT: -: 0:Programs:1 // CHECK-NEXT: 1: 1:void foo() { } // CHECK-NEXT: -: 2: // CHECK-NEXT: 1: 3:void bar() { } diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-switch1.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-switch1.c.gcov index 741dff59954bc..2b4d67f9abbef 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-gcov-switch1.c.gcov +++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-switch1.c.gcov @@ -2,7 +2,6 @@ // CHECK-NEXT: -: 0:Graph:instrprof-gcov-switch1.gcno // CHECK-NEXT: -: 0:Data:instrprof-gcov-switch1.gcda // CHECK-NEXT: -: 0:Runs:1 -// CHECK-NEXT: -: 0:Programs:1 // CHECK-NEXT: 1: 1:int main(void) // CHECK-NEXT: -: 2:{ // CHECK-NEXT: 1: 3: int i = 22; diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-switch2.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-switch2.c.gcov index c931365ddf484..f9501e0c870b2 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-gcov-switch2.c.gcov +++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-switch2.c.gcov @@ -2,7 +2,6 @@ // CHECK-NEXT: -: 0:Graph:instrprof-gcov-switch2.gcno // CHECK-NEXT: -: 0:Data:instrprof-gcov-switch2.gcda // CHECK-NEXT: -: 0:Runs:1 -// CHECK-NEXT: -: 0:Programs:1 // CHECK-NEXT: 1: 1:int main(void) // CHECK-NEXT: -: 2:{ // CHECK-NEXT: 1: 3: int i = 22; diff --git 
a/compiler-rt/test/profile/Inputs/instrprof-shared-lib_in-loop.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-shared-lib_in-loop.c.gcov index 69350471312e3..d75a222977a0c 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-shared-lib_in-loop.c.gcov +++ b/compiler-rt/test/profile/Inputs/instrprof-shared-lib_in-loop.c.gcov @@ -2,7 +2,6 @@ // CHECK-NEXT: -: 0:Graph:instrprof-shared-lib.gcno // CHECK-NEXT: -: 0:Data:instrprof-shared-lib.gcda // CHECK-NEXT: -: 0:Runs:1 -// CHECK-NEXT: -: 0:Programs:1 // CHECK-NEXT: -: 1:int g1 = 0; // CHECK-NEXT: -: 2:int g2 = 1; // CHECK-NEXT: -: 3: diff --git a/compiler-rt/test/profile/Inputs/instrprof-shared-main.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-shared-main.c.gcov index a31a60238809a..24facb5e1a380 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-shared-main.c.gcov +++ b/compiler-rt/test/profile/Inputs/instrprof-shared-main.c.gcov @@ -2,7 +2,6 @@ // CHECK-NEXT: -: 0:Graph:instrprof-shared-main.gcno // CHECK-NEXT: -: 0:Data:instrprof-shared-main.gcda // CHECK-NEXT: -: 0:Runs:1 -// CHECK-NEXT: -: 0:Programs:1 // CHECK-NEXT: -: 1:extern int g1, g2; // CHECK-NEXT: -: 2:extern void foo(int n); // CHECK-NEXT: -: 3: diff --git a/compiler-rt/test/profile/gcov-__gcov_flush-terminate.c b/compiler-rt/test/profile/gcov-__gcov_flush-terminate.c index ca13a0896a7b2..96cf4296524d1 100644 --- a/compiler-rt/test/profile/gcov-__gcov_flush-terminate.c +++ b/compiler-rt/test/profile/gcov-__gcov_flush-terminate.c @@ -8,7 +8,6 @@ // RUN: llvm-cov gcov -t gcov-__gcov_flush-terminate.gcda | FileCheck %s // CHECK: -: 0:Runs:1 -// CHECK-NEXT: -: 0:Programs:1 void __gcov_dump(void); void __gcov_reset(void); diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index f9be7f933d31e..6e86ffdc80275 100644 --- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -61,7 +61,7 @@ enum : uint32_t { }; 
static cl::opt DefaultGCOVVersion("default-gcov-version", - cl::init("408*"), cl::Hidden, + cl::init("0000"), cl::Hidden, cl::ValueRequired); static cl::opt AtomicCounter("gcov-atomic-counter", cl::Hidden, @@ -154,6 +154,7 @@ class GCOVProfiler { GCOVOptions Options; llvm::endianness Endian; raw_ostream *os; + int Version = 0; // Checksum, produced by hash of EdgeDestinations SmallVector FileChecksums; @@ -334,12 +335,9 @@ namespace { : GCOVRecord(P), SP(SP), EndLine(EndLine), Ident(Ident), Version(Version), EntryBlock(P, 0), ReturnBlock(P, 1) { LLVM_DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n"); - bool ExitBlockBeforeBody = Version >= 48; - uint32_t i = ExitBlockBeforeBody ? 2 : 1; + uint32_t i = 2; for (BasicBlock &BB : *F) Blocks.insert(std::make_pair(&BB, GCOVBlock(P, i++))); - if (!ExitBlockBeforeBody) - ReturnBlock.Number = i; std::string FunctionNameAndLine; raw_string_ostream FNLOS(FunctionNameAndLine); @@ -363,44 +361,28 @@ namespace { void writeOut(uint32_t CfgChecksum) { write(GCOV_TAG_FUNCTION); SmallString<128> Filename = getFilename(SP); - uint32_t BlockLen = - 2 + (Version >= 47) + wordsOfString(getFunctionName(SP)); - if (Version < 80) - BlockLen += wordsOfString(Filename) + 1; - else - BlockLen += 1 + wordsOfString(Filename) + 3 + (Version >= 90); + uint32_t BlockLen = 3 + wordsOfString(getFunctionName(SP)); + BlockLen += 1 + wordsOfString(Filename) + 4; write(BlockLen); write(Ident); write(FuncChecksum); - if (Version >= 47) - write(CfgChecksum); + write(CfgChecksum); writeString(getFunctionName(SP)); - if (Version < 80) { - writeString(Filename); - write(SP->getLine()); - } else { - write(SP->isArtificial()); // artificial - writeString(Filename); - write(SP->getLine()); // start_line - write(0); // start_column - // EndLine is the last line with !dbg. It is not the } line as in GCC, - // but good enough. 
- write(EndLine); - if (Version >= 90) - write(0); // end_column - } + + write(SP->isArtificial()); // artificial + writeString(Filename); + write(SP->getLine()); // start_line + write(0); // start_column + // EndLine is the last line with !dbg. It is not the } line as in GCC, + // but good enough. + write(EndLine); + write(0); // end_column // Emit count of blocks. write(GCOV_TAG_BLOCKS); - if (Version < 80) { - write(Blocks.size() + 2); - for (int i = Blocks.size() + 2; i; --i) - write(0); - } else { - write(1); - write(Blocks.size() + 2); - } + write(1); + write(Blocks.size() + 2); LLVM_DEBUG(dbgs() << (Blocks.size() + 1) << " blocks\n"); // Emit edges between blocks. @@ -767,7 +749,6 @@ bool GCOVProfiler::emitProfileNotes( function_ref GetBFI, function_ref GetBPI, function_ref GetTLI) { - int Version; { uint8_t c3 = Options.Version[0]; uint8_t c2 = Options.Version[1]; @@ -775,6 +756,11 @@ bool GCOVProfiler::emitProfileNotes( Version = c3 >= 'A' ? (c3 - 'A') * 100 + (c2 - '0') * 10 + c1 - '0' : (c3 - '0') * 10 + c1 - '0'; } + // Emit .gcno files that are compatible with GCC 11.1. 
+ if (Version < 111) { + Version = 111; + memcpy(Options.Version, "B11*", 4); + } bool EmitGCDA = Options.EmitData; for (unsigned i = 0, e = CUNode->getNumOperands(); i != e; ++i) { @@ -973,10 +959,8 @@ bool GCOVProfiler::emitProfileNotes( out.write(Tmp, 4); } write(Stamp); - if (Version >= 90) - writeString(""); // unuseful current_working_directory - if (Version >= 80) - write(0); // unuseful has_unexecuted_blocks + writeString("."); // unuseful current_working_directory + write(0); // unuseful has_unexecuted_blocks for (auto &Func : Funcs) Func->writeOut(Stamp); diff --git a/llvm/test/Transforms/GCOVProfiling/exit-block.ll b/llvm/test/Transforms/GCOVProfiling/exit-block.ll index 50c4dc4665c95..567e22222f580 100644 --- a/llvm/test/Transforms/GCOVProfiling/exit-block.ll +++ b/llvm/test/Transforms/GCOVProfiling/exit-block.ll @@ -9,7 +9,7 @@ ; But we can optionally emit it last, to match GCC<4.8 (r189778). ; RUN: opt -passes=insert-gcov-profiling -default-gcov-version='407*' -disable-output %t/2 -; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck --check-prefixes=CHECK,EXIT-LAST %s +; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck --check-prefixes=CHECK,EXIT-SECOND %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/GCOVProfiling/version.ll b/llvm/test/Transforms/GCOVProfiling/version.ll index bfac2557da0b1..4751bc1bd6dc7 100644 --- a/llvm/test/Transforms/GCOVProfiling/version.ll +++ b/llvm/test/Transforms/GCOVProfiling/version.ll @@ -5,16 +5,16 @@ ; RUN: cat %t/little.txt %s %t/version.txt > %t/2 ; RUN: opt -passes=insert-gcov-profiling -disable-output < %t/2 -; RUN: head -c8 %t/version.gcno | grep '^oncg.804' +; RUN: head -c8 %t/version.gcno | grep '^oncg.11B' ; RUN: rm %t/version.gcno ; RUN: not opt -passes=insert-gcov-profiling -default-gcov-version=asdfasdf -disable-output < %t/2 -; RUN: opt -passes=insert-gcov-profiling 
-default-gcov-version='402*' -disable-output < %t/2 -; RUN: head -c8 %t/version.gcno | grep '^oncg.204' +; RUN: opt -passes=insert-gcov-profiling -default-gcov-version='B21*' -disable-output < %t/2 +; RUN: head -c8 %t/version.gcno | grep '^oncg.12B' ; RUN: rm %t/version.gcno ; RUN: cat %t/big.txt %s %t/version.txt > %t/big.ll ; RUN: opt -passes=insert-gcov-profiling -disable-output < %t/big.ll -; RUN: head -c8 %t/version.gcno | grep '^gcno408.' +; RUN: head -c8 %t/version.gcno | grep '^gcnoB11.' define void @test() !dbg !5 { ret void, !dbg !8 From 3ef78188d0d39cd00429f77f1b300be9bdf85770 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Fri, 3 Jan 2025 16:41:18 +0800 Subject: [PATCH 055/480] [PowerPC] Use `RegisterClassInfo::getRegPressureSetLimit` (#120383) `RegisterClassInfo::getRegPressureSetLimit` is a wrapper of `TargetRegisterInfo::getRegPressureSetLimit` with some logics to adjust the limit by removing reserved registers. It seems that we shouldn't use `TargetRegisterInfo::getRegPressureSetLimit` directly, just like the comment "This limit must be adjusted dynamically for reserved registers" said. Separate from https://github.com/llvm/llvm-project/pull/118787 --- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 44f6db5061e21..fa45a7fb7fabe 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -643,8 +643,8 @@ bool PPCInstrInfo::shouldReduceRegisterPressure( }; // For now we only care about float and double type fma. - unsigned VSSRCLimit = TRI->getRegPressureSetLimit( - *MBB->getParent(), PPC::RegisterPressureSets::VSSRC); + unsigned VSSRCLimit = + RegClassInfo->getRegPressureSetLimit(PPC::RegisterPressureSets::VSSRC); // Only reduce register pressure when pressure is high. 
return GetMBBPressure(MBB)[PPC::RegisterPressureSets::VSSRC] > From 27f30029741ecf023baece7b3dde1ff9011ffefc Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Fri, 3 Jan 2025 00:34:24 +0100 Subject: [PATCH 056/480] [llvm-(min-)tblgen] Avoid redundant source compilation (#114494) All the sources of `llvm-min-tblgen` are also used for `llvm-tblgen`, with identical compilation flags. Reuse the object files of `llvm-min-tblgen` for `llvm-tblgen` by applying the usual source structure of an executable: One file per executable, which is named after the executable and contains the (in this case trivial) main function, which just calls the tblgen_main in TableGen.cpp. This should also clear up any confusion (including mine) of where each executable's main function is. While this slightly reduces build time, the main motivation is ccache. Using the hard_link option, building the object files for `llvm-tblgen` will result in a hard link to the same object file already used for `llvm-min-tblgen`. To signal the build system that the file is new, ccache will update the file's time stamp. Unfortunately, time stamps are shared between all hard-linked files, such that this will indirectly also update the time stamps for the object files used for `llvm-tblgen`. At the next run, Ninja will recognize this time stamp discrepancy to the expected stamp recorded in `.ninja_log` and rebuild those object files for `llvm-min-tblgen`, which again will also update the stamp for the `llvm-tblgen`, and so on. This is especially annoying for tablegen because it means Ninja will re-run all tablegenning in every build. I am using the hard_link option because it reduces the cost of having multiple build-trees of the LLVM sources and reduces the wear to the SSD they are stored on.
--- .../{ => Basic}/ARMTargetDefEmitter.cpp | 0 .../utils/TableGen/{ => Basic}/Attributes.cpp | 0 llvm/utils/TableGen/Basic/CMakeLists.txt | 7 ++++++ .../TableGen/{ => Basic}/DirectiveEmitter.cpp | 0 .../TableGen/{ => Basic}/IntrinsicEmitter.cpp | 4 ++-- .../{ => Basic}/RISCVTargetDefEmitter.cpp | 0 llvm/utils/TableGen/{ => Basic}/TableGen.cpp | 6 +++-- llvm/utils/TableGen/Basic/TableGen.h | 13 +++++++++++ llvm/utils/TableGen/{ => Basic}/VTEmitter.cpp | 0 llvm/utils/TableGen/CMakeLists.txt | 23 ++++++++----------- llvm/utils/TableGen/llvm-min-tblgen.cpp | 18 +++++++++++++++ llvm/utils/TableGen/llvm-tblgen.cpp | 18 +++++++++++++++ 12 files changed, 71 insertions(+), 18 deletions(-) rename llvm/utils/TableGen/{ => Basic}/ARMTargetDefEmitter.cpp (100%) rename llvm/utils/TableGen/{ => Basic}/Attributes.cpp (100%) rename llvm/utils/TableGen/{ => Basic}/DirectiveEmitter.cpp (100%) rename llvm/utils/TableGen/{ => Basic}/IntrinsicEmitter.cpp (99%) rename llvm/utils/TableGen/{ => Basic}/RISCVTargetDefEmitter.cpp (100%) rename llvm/utils/TableGen/{ => Basic}/TableGen.cpp (94%) create mode 100644 llvm/utils/TableGen/Basic/TableGen.h rename llvm/utils/TableGen/{ => Basic}/VTEmitter.cpp (100%) create mode 100644 llvm/utils/TableGen/llvm-min-tblgen.cpp create mode 100644 llvm/utils/TableGen/llvm-tblgen.cpp diff --git a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/ARMTargetDefEmitter.cpp rename to llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp diff --git a/llvm/utils/TableGen/Attributes.cpp b/llvm/utils/TableGen/Basic/Attributes.cpp similarity index 100% rename from llvm/utils/TableGen/Attributes.cpp rename to llvm/utils/TableGen/Basic/Attributes.cpp diff --git a/llvm/utils/TableGen/Basic/CMakeLists.txt b/llvm/utils/TableGen/Basic/CMakeLists.txt index 41d737e8d418e..b058fba78eb05 100644 --- a/llvm/utils/TableGen/Basic/CMakeLists.txt +++ 
b/llvm/utils/TableGen/Basic/CMakeLists.txt @@ -9,8 +9,15 @@ set(LLVM_LINK_COMPONENTS ) add_llvm_library(LLVMTableGenBasic OBJECT EXCLUDE_FROM_ALL DISABLE_LLVM_LINK_LLVM_DYLIB + ARMTargetDefEmitter.cpp + Attributes.cpp CodeGenIntrinsics.cpp + DirectiveEmitter.cpp + IntrinsicEmitter.cpp + RISCVTargetDefEmitter.cpp SDNodeProperties.cpp + TableGen.cpp + VTEmitter.cpp ) # Users may include its headers as "Basic/*.h" diff --git a/llvm/utils/TableGen/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/DirectiveEmitter.cpp rename to llvm/utils/TableGen/Basic/DirectiveEmitter.cpp diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp similarity index 99% rename from llvm/utils/TableGen/IntrinsicEmitter.cpp rename to llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp index 093602c3da804..fc2b8908a35b8 100644 --- a/llvm/utils/TableGen/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#include "Basic/CodeGenIntrinsics.h" -#include "Basic/SequenceToOffsetTable.h" +#include "CodeGenIntrinsics.h" +#include "SequenceToOffsetTable.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/RISCVTargetDefEmitter.cpp rename to llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/Basic/TableGen.cpp similarity index 94% rename from llvm/utils/TableGen/TableGen.cpp rename to llvm/utils/TableGen/Basic/TableGen.cpp index bea2a2e735dbe..80ac93f2b54fb 100644 --- a/llvm/utils/TableGen/TableGen.cpp +++ b/llvm/utils/TableGen/Basic/TableGen.cpp @@ -6,10 +6,12 @@ // 
//===----------------------------------------------------------------------===// // -// This file contains the main function for LLVM's TableGen. +// This file contains the global defintions (mostly command line parameters) +// shared between llvm-tblgen and llvm-min-tblgen. // //===----------------------------------------------------------------------===// +#include "TableGen.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" @@ -74,7 +76,7 @@ static TableGen::Emitter::Opt X[] = { {"print-sets", printSets, "Print expanded sets for testing DAG exprs"}, }; -int main(int argc, char **argv) { +int tblgen_main(int argc, char **argv) { InitLLVM X(argc, argv); cl::ParseCommandLineOptions(argc, argv); diff --git a/llvm/utils/TableGen/Basic/TableGen.h b/llvm/utils/TableGen/Basic/TableGen.h new file mode 100644 index 0000000000000..630aea62fcf90 --- /dev/null +++ b/llvm/utils/TableGen/Basic/TableGen.h @@ -0,0 +1,13 @@ +//===- TableGen.h ---------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Shared entry point for llvm-tblgen and llvm-min-tblgen. 
+// +//===----------------------------------------------------------------------===// + +int tblgen_main(int argc, char **argv); diff --git a/llvm/utils/TableGen/VTEmitter.cpp b/llvm/utils/TableGen/Basic/VTEmitter.cpp similarity index 100% rename from llvm/utils/TableGen/VTEmitter.cpp rename to llvm/utils/TableGen/Basic/VTEmitter.cpp diff --git a/llvm/utils/TableGen/CMakeLists.txt b/llvm/utils/TableGen/CMakeLists.txt index ba1e4aa01b48d..96a74c6fd89f7 100644 --- a/llvm/utils/TableGen/CMakeLists.txt +++ b/llvm/utils/TableGen/CMakeLists.txt @@ -11,14 +11,13 @@ set(LLVM_LINK_COMPONENTS Support) # build llvm/include. It must not depend on TableGenCommon, as # TableGenCommon depends on this already to generate things such as # ValueType definitions. +# Sources included in both, llvm-min-tblgen and llvm-tblgen, must be included +# into LLVMTableGenBasic to avoid redundant compilation and problems with build +# caches. +# At least one source file must be included directly to avoid CMake problems. +# E.g. CMake derives which linker to use from the types of sources added. 
add_tablegen(llvm-min-tblgen LLVM_HEADERS - TableGen.cpp - ARMTargetDefEmitter.cpp - Attributes.cpp - DirectiveEmitter.cpp - IntrinsicEmitter.cpp - RISCVTargetDefEmitter.cpp - VTEmitter.cpp + llvm-min-tblgen.cpp $ PARTIAL_SOURCES_INTENDED @@ -32,10 +31,8 @@ set(LLVM_LINK_COMPONENTS add_tablegen(llvm-tblgen LLVM DESTINATION "${LLVM_TOOLS_INSTALL_DIR}" EXPORT LLVM - ARMTargetDefEmitter.cpp AsmMatcherEmitter.cpp AsmWriterEmitter.cpp - Attributes.cpp CallingConvEmitter.cpp CodeEmitterGen.cpp CodeGenMapTable.cpp @@ -48,7 +45,6 @@ add_tablegen(llvm-tblgen LLVM DecoderEmitter.cpp DFAEmitter.cpp DFAPacketizerEmitter.cpp - DirectiveEmitter.cpp DisassemblerEmitter.cpp DXILEmitter.cpp ExegesisEmitter.cpp @@ -57,18 +53,15 @@ add_tablegen(llvm-tblgen LLVM GlobalISelEmitter.cpp InstrDocsEmitter.cpp InstrInfoEmitter.cpp - IntrinsicEmitter.cpp + llvm-tblgen.cpp MacroFusionPredicatorEmitter.cpp OptionParserEmitter.cpp OptionRSTEmitter.cpp PseudoLoweringEmitter.cpp RegisterBankEmitter.cpp RegisterInfoEmitter.cpp - RISCVTargetDefEmitter.cpp SearchableTableEmitter.cpp SubtargetEmitter.cpp - TableGen.cpp - VTEmitter.cpp WebAssemblyDisassemblerEmitter.cpp X86InstrMappingEmitter.cpp X86DisassemblerTables.cpp @@ -79,6 +72,8 @@ add_tablegen(llvm-tblgen LLVM $ $ + PARTIAL_SOURCES_INTENDED + DEPENDS intrinsics_gen # via llvm-min-tablegen ) diff --git a/llvm/utils/TableGen/llvm-min-tblgen.cpp b/llvm/utils/TableGen/llvm-min-tblgen.cpp new file mode 100644 index 0000000000000..79fce5c555f6e --- /dev/null +++ b/llvm/utils/TableGen/llvm-min-tblgen.cpp @@ -0,0 +1,18 @@ +//===- llvm-min-tblgen.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the main function for LLVM's TableGen. +// +//===----------------------------------------------------------------------===// + +#include "Basic/TableGen.h" + +/// Command line parameters are shared between llvm-tblgen and llvm-min-tblgen. +/// The indirection to tblgen_main exists to ensure that the static variables +/// for the llvm::cl:: mechanism are linked into both executables. +int main(int argc, char **argv) { return tblgen_main(argc, argv); } diff --git a/llvm/utils/TableGen/llvm-tblgen.cpp b/llvm/utils/TableGen/llvm-tblgen.cpp new file mode 100644 index 0000000000000..a38382472a992 --- /dev/null +++ b/llvm/utils/TableGen/llvm-tblgen.cpp @@ -0,0 +1,18 @@ +//===- llvm-tblgen.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the main function for LLVM's TableGen. +// +//===----------------------------------------------------------------------===// + +#include "Basic/TableGen.h" + +/// Command line parameters are shared between llvm-tblgen and llvm-min-tblgen. +/// The indirection to tblgen_main exists to ensure that the static variables +/// for the llvm::cl:: mechanism are linked into both executables. 
+int main(int argc, char **argv) { return tblgen_main(argc, argv); } From 67ff11ea5b2d2d51fa634361dd88c6dc9429706a Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Fri, 3 Jan 2025 16:43:39 +0800 Subject: [PATCH 057/480] [LoongArch] Avoid scheduling tls-desc code sequence in large code model (#121541) --- llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp | 11 ++++++++++- .../CodeGen/LoongArch/psabi-restricted-scheduling.ll | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index 7d0e4f9d58a16..54aeda2836400 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -406,6 +406,11 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, // lu32i.d $a1, %ie64_pc_lo20(s) // lu52i.d $a1, $a1, %ie64_pc_hi12(s) // + // * pcalau12i $a0, %desc_pc_hi20(s) + // addi.d $a1, $zero, %desc_pc_lo12(s) + // lu32i.d $a1, %desc64_pc_lo20(s) + // lu52i.d $a1, $a1, %desc64_pc_hi12(s) + // // For simplicity, only pcalau12i and lu52i.d are marked as scheduling // boundaries, and the instructions between them are guaranteed to be // ordered according to data dependencies. 
@@ -430,12 +435,16 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, if (MO0 == LoongArchII::MO_IE_PC_HI && MO1 == LoongArchII::MO_IE_PC_LO && MO2 == LoongArchII::MO_IE_PC64_LO) return true; + if (MO0 == LoongArchII::MO_DESC_PC_HI && + MO1 == LoongArchII::MO_DESC_PC_LO && + MO2 == LoongArchII::MO_DESC64_PC_LO) + return true; break; } case LoongArch::LU52I_D: { auto MO = MI.getOperand(2).getTargetFlags(); if (MO == LoongArchII::MO_PCREL64_HI || MO == LoongArchII::MO_GOT_PC64_HI || - MO == LoongArchII::MO_IE_PC64_HI) + MO == LoongArchII::MO_IE_PC64_HI || MO == LoongArchII::MO_DESC64_PC_HI) return true; break; } diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll index 1773b8e014997..3390f7fe14ae6 100644 --- a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll @@ -252,8 +252,8 @@ define void @baz() nounwind { ; LARGEDESC_SCH: # %bb.0: ; LARGEDESC_SCH-NEXT: addi.d $sp, $sp, -16 ; LARGEDESC_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LARGEDESC_SCH-NEXT: addi.d $a1, $zero, %desc_pc_lo12(gd) ; LARGEDESC_SCH-NEXT: pcalau12i $a0, %desc_pc_hi20(gd) +; LARGEDESC_SCH-NEXT: addi.d $a1, $zero, %desc_pc_lo12(gd) ; LARGEDESC_SCH-NEXT: lu32i.d $a1, %desc64_pc_lo20(gd) ; LARGEDESC_SCH-NEXT: lu52i.d $a1, $a1, %desc64_pc_hi12(gd) ; LARGEDESC_SCH-NEXT: add.d $a0, $a0, $a1 From 8b23ebb498bc67f03571b1d429771b28868b8932 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 03:55:58 -0500 Subject: [PATCH 058/480] [AMDGPU][True16[MC] true16 for v_max3/min3_num_f16 (#121510) V_MAX3/MIN3_NUM_F16 are alias GFX12 instructions with V_MAX3/MIN3_F16 in GFX11 and they should be updated together. This fix a bug introduced in https://github.com/llvm/llvm-project/pull/113603 such that only V_MAX3/MIN3_F16 are replaced in true16 format. 
Also added GFX12 runlines for CodeGen test --- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 4 +- llvm/test/CodeGen/AMDGPU/fmax3.ll | 134 ++++++++++++ llvm/test/CodeGen/AMDGPU/fmin3.ll | 200 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx12_asm_vop3.s | 144 +++++++------ llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s | 8 +- llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s | 168 +++++++-------- llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s | 152 ++++++------- .../Disassembler/AMDGPU/gfx12_dasm_vop3.txt | 162 +++++++++++--- .../AMDGPU/gfx12_dasm_vop3_dpp16.txt | 200 +++++++++++++++--- .../AMDGPU/gfx12_dasm_vop3_dpp8.txt | 200 +++++++++++++++--- 10 files changed, 1056 insertions(+), 316 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 22e457674c07a..d00c810859e3b 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1578,8 +1578,8 @@ def : MinimumMaximumByMinimum3Maximum3; defm V_MIN3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x229, "V_MIN3_F32", "v_min3_num_f32">; defm V_MAX3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x22a, "V_MAX3_F32", "v_max3_num_f32">; -defm V_MIN3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x22b, "V_MIN3_F16", "v_min3_num_f16">; -defm V_MAX3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x22c, "V_MAX3_F16", "v_max3_num_f16">; +defm V_MIN3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22b, "v_min3_num_f16", "V_MIN3_F16", "v_min3_f16">; +defm V_MAX3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22c, "v_max3_num_f16", "V_MAX3_F16", "v_max3_f16">; defm V_MINIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22d>; defm V_MAXIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22e>; defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x22f>; diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 4b3f0dbbaea98..fbcdbed338e60 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -3,6 +3,7 @@ ; RUN: 
llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_0_f32: @@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmax3_olt_0_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max3_num_f32 
v0, v0, v1, v2 +; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_max3_f32 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmax3_olt_1_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max3_num_f32 v0, v2, v0, v1 +; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; 
GFX12-LABEL: test_fmax3_olt_0_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max3_num_f16 v0, v0, v1, v2 +; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_max3_f16 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmax3_olt_1_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: 
s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max3_num_f16 v0, v2, v0, v1 +; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -594,6 +715,19 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; GFX11-NEXT: v_pk_max_f16 v0, v2, v0 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: no_fmax3_v2f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v0 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max) diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 38b712e044df9..269fd52df5c49 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope 
-check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_0_f32: @@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_0_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_min3_num_f32 v0, v0, v1, v2 +; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile 
float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min3_f32 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_1_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_min3_num_f32 v0, v2, v0, v1 +; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min3_f16 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_0_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; 
GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_min3_num_f16 v0, v0, v1, v2 +; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min3_f16 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_1_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: 
s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_min3_num_f16 v0, v2, v0, v1 +; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -594,6 +715,19 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; GFX11-NEXT: v_pk_min_f16 v0, v2, v0 ; GFX11-NEXT: v_pk_min_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: no_fmin3_v2f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v0, v2, v0 +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min) @@ -734,6 +868,39 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_0_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; 
GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s6 +; GFX12-NEXT: s_mov_b32 s13, s7 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 %c = load volatile double, ptr addrspace(1) %cptr, align 4 @@ -877,6 +1044,39 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_1_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; 
GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s6 +; GFX12-NEXT: s_mov_b32 s13, s7 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 %c = load volatile double, ptr addrspace(1) %cptr, align 4 diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s index 0309b2e8e517e..5674d26327201 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s @@ -3164,50 +3164,62 @@ v_mad_co_u64_u32 v[5:6], ttmp[14:15], src_scc, vcc_lo, src_scc v_mad_co_u64_u32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp // GFX12: v_mad_co_u64_u32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp ; encoding: [0xfe,0xfc,0xfe,0xd6,0xff,0xd6,0xc0,0x03,0x56,0x34,0x12,0xaf] -v_max3_num_f16 v5, v1, v2, s3 -// GFX12: v_max3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00] +v_max3_num_f16 v5.l, v1.l, v2.l, s3 +// GFX12: v_max3_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00] -v_max3_num_f16 v5, v255, s2, s105 -// GFX12: v_max3_num_f16 v5, 
v255, s2, s105 ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01] +v_max3_num_f16 v5.l, v255.l, s2, s105 +// GFX12: v_max3_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01] -v_max3_num_f16 v5, s1, v255, exec_hi -// GFX12: v_max3_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01] +v_max3_num_f16 v5.l, s1, v255.l, exec_hi +// GFX12: v_max3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01] -v_max3_num_f16 v5, s105, s105, exec_lo -// GFX12: v_max3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01] +v_max3_num_f16 v5.l, s105, s105, exec_lo +// GFX12: v_max3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01] -v_max3_num_f16 v5, vcc_lo, ttmp15, v3 -// GFX12: v_max3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04] +v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX12: v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04] -v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 -// GFX12: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX12: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] -v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| -// GFX12: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1] +v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX12: v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1] -v_max3_num_f16 v5, m0, 0.5, m0 -// GFX12: v_max3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01] +v_max3_num_f16 v5.l, m0, 0.5, m0 +// GFX12: v_max3_num_f16 v5.l, m0, 0.5, m0 ; encoding: 
[0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01] -v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi -// GFX12: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01] +v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX12: v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01] -v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] -// GFX12: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1] +v_max3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] +// GFX12: v_max3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1] -v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] -// GFX12: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +v_max3_num_f16 v5.l, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] +// GFX12: v_max3_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] -v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] -// GFX12: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3] +v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] +// GFX12: v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3] -v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] -// GFX12: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43] +v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] +// GFX12: v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43] -v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] -// GFX12: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: 
[0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23] +v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] +// GFX12: v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23] -v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp -// GFX12: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp +// GFX12: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +v_max3_num_f16 v5.l, v255.h, s2, s105 +// GFX12: v_max3_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x2c,0xd6,0xff,0x05,0xa4,0x01] + +v_max3_num_f16 v5.l, s1, v255.h, exec_hi +// GFX12: v_max3_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0x01,0xfe,0xff,0x01] + +v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h +// GFX12: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp +// GFX12: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] v_max3_num_f32 v5, v1, v2, s3 // GFX12: v_max3_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x00] @@ -4142,50 +4154,62 @@ v_med3_u32 v5, src_scc, vcc_lo, -1 v_med3_u32 v255, 0xaf123456, vcc_hi, null // GFX12: v_med3_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x21,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] -v_min3_num_f16 v5, v1, v2, s3 -// GFX12: v_min3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00] +v_min3_num_f16 v5.l, v1.l, v2.l, s3 +// GFX12: v_min3_num_f16 v5.l, v1.l, v2.l, s3 ; 
encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00] + +v_min3_num_f16 v5.l, v255.l, s2, s105 +// GFX12: v_min3_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01] + +v_min3_num_f16 v5.l, s1, v255.l, exec_hi +// GFX12: v_min3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01] + +v_min3_num_f16 v5.l, s105, s105, exec_lo +// GFX12: v_min3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01] + +v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX12: v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04] -v_min3_num_f16 v5, v255, s2, s105 -// GFX12: v_min3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01] +v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX12: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] -v_min3_num_f16 v5, s1, v255, exec_hi -// GFX12: v_min3_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01] +v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX12: v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1] -v_min3_num_f16 v5, s105, s105, exec_lo -// GFX12: v_min3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01] +v_min3_num_f16 v5.l, m0, 0.5, m0 +// GFX12: v_min3_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01] -v_min3_num_f16 v5, vcc_lo, ttmp15, v3 -// GFX12: v_min3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04] +v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX12: v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01] -v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 -// GFX12: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: 
[0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +v_min3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] +// GFX12: v_min3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1] -v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| -// GFX12: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1] +v_min3_num_f16 v5.l, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] +// GFX12: v_min3_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] -v_min3_num_f16 v5, m0, 0.5, m0 -// GFX12: v_min3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01] +v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] +// GFX12: v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3] -v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi -// GFX12: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01] +v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] +// GFX12: v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43] -v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] -// GFX12: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1] +v_min3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] +// GFX12: v_min3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23] -v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] -// GFX12: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp +// GFX12: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; 
encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] -v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] -// GFX12: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3] +v_min3_num_f16 v5.l, v255.h, s2, s105 +// GFX12: v_min3_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x2b,0xd6,0xff,0x05,0xa4,0x01] -v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] -// GFX12: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43] +v_min3_num_f16 v5.l, s1, v255.h, exec_hi +// GFX12: v_min3_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0x01,0xfe,0xff,0x01] -v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] -// GFX12: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23] +v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h +// GFX12: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] -v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp -// GFX12: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp +// GFX12: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] v_min3_num_f32 v5, v1, v2, s3 // GFX12: v_min3_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x29,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s index 59cb1a479450f..ee4561fad367c 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s @@ -6,11 +6,11 @@ v_min3_f32 v5, v1, v2, v3 v_max3_f32 
v5, v1, v2, v3 // GFX12: v_max3_num_f32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x04] -v_min3_f16 v5, v1, v2, v3 -// GFX12: v_min3_num_f16 v5, v1, v2, v3 ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x04] +v_min3_f16 v5.l, v1.l, v2.l, v3.l +// GFX12: v_min3_num_f16 v5.l, v1.l, v2.l, v3.l ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x04] -v_max3_f16 v5, v1, v2, v3 -// GFX12: v_max3_num_f16 v5, v1, v2, v3 ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x04] +v_max3_f16 v5.l, v1.l, v2.l, v3.l +// GFX12: v_max3_num_f16 v5.l, v1.l, v2.l, v3.l ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x04] v_med3_f32 v5, v1, v2, v3 // GFX12: v_med3_num_f32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0x31,0xd6,0x01,0x05,0x0e,0x04] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s index b769324d5412f..0fa344f7e73a3 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s @@ -2480,53 +2480,53 @@ v_mad_u32_u24_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bou v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x0b,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] -v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] -v_max3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] -v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff] -v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] -v_max3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] -v_max3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, 
v255.l row_half_mirror +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] -v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] -v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] -v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] -v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 -// GFX12: v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX12: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] 
-v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 -// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] -v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 -// GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX12: v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] -v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] -v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] -v_max3_num_f16_e64_dpp v5, 
v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] -v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] +v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX12: v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] @@ -3515,53 +3515,53 @@ v_med3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ v_med3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_med3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x21,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] -v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] -v_min3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] -v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff] -v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] -v_min3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// 
GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] -v_min3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] -v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] -v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] -v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] -v_min3_num_f16_e64_dpp v5, |v1|, 
v2, -ttmp15 row_shr:15 -// GFX12: v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX12: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] -v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 -// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] -v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 -// GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX12: v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] -v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] -v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_min3_num_f16_e64_dpp 
v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] -v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] -v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] +v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX12: v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] @@ -5302,20 +5302,20 @@ v_mad_u32_u16_e64_dpp v5, v1, v2, 0.5 op_sel:[1,0,0,0] row_xmask:0 row_mask:0x1 v_mad_u32_u16_e64_dpp v255, v255, v255, 
src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 // GFX12: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x90,0x59,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] -v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf -// GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX12: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] -v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] -v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] -v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 -// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] -v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 -// GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX12: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf // GFX12: v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x4d,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] @@ -5392,20 +5392,20 @@ v_med3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 // GFX12: v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x51,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] -v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf -// GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX12: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] -v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] -v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] -v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 
bank_mask:0x3 -// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] -v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 -// GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX12: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf // GFX12: v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x4a,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s index f76dd26623144..657663f4353ba 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s @@ -1545,47 +1545,47 @@ v_mad_u32_u24_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp 
dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x0b,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] -v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2c,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2c,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, 
-|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] -// GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2c,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2c,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] v_max3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_max3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] @@ -2340,47 +2340,47 @@ v_med3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_med3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_med3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x21,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] -v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x2b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp 
v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2b,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2b,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] -// GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2b,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2b,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] v_min3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_min3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] @@ -3571,20 +3571,20 @@ v_mad_u32_u16_e64_dpp v5, v1, v2, 0.5 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 // GFX12: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x90,0x59,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] -v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| 
op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] -v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX12: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x4d,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] @@ -3661,20 +3661,20 @@ v_med3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 // GFX12: v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x51,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] -v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null 
op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] -v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX12: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x4a,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt index 4c2060ad44b8a..58696613e852f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt @@ -3509,49 +3509,100 @@ # GFX12: v_mad_co_u64_u32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp ; encoding: [0xfe,0xfc,0xfe,0xd6,0xff,0xd6,0xc0,0x03,0x56,0x34,0x12,0xaf] 0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00 -# GFX12: v_max3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00] +# W32-REAL16: v_max3_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00] +# W32-FAKE16: v_max3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00] +# W64-REAL16: v_max3_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00] +# W64-FAKE16: v_max3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00] 0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01 -# GFX12: v_max3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01] +# W32-REAL16: v_max3_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01] +# W32-FAKE16: v_max3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01] +# W64-REAL16: v_max3_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01] +# W64-FAKE16: v_max3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01] 0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01 -# GFX12: v_max3_num_f16 v5, s1, v255, exec_hi ; encoding: 
[0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01] +# W32-REAL16: v_max3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01] +# W32-FAKE16: v_max3_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01] +# W64-REAL16: v_max3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01] +# W64-FAKE16: v_max3_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01] 0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01 -# GFX12: v_max3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01] +# W32-REAL16: v_max3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01] +# W32-FAKE16: v_max3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01] +# W64-REAL16: v_max3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01] +# W64-FAKE16: v_max3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01] 0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04 -# GFX12: v_max3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04] +# W32-REAL16: v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04] +# W32-FAKE16: v_max3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04] +# W64-REAL16: v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04] +# W64-FAKE16: v_max3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04] 0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00 -# GFX12: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_max3_num_f16 v5, vcc_hi, 0xfe0b, 
v255 ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] 0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1 -# GFX12: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1] +# W32-REAL16: v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1] +# W32-FAKE16: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1] +# W64-REAL16: v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1] +# W64-FAKE16: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1] 0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01 -# GFX12: v_max3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01] +# W32-REAL16: v_max3_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01] +# W32-FAKE16: v_max3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01] +# W64-REAL16: v_max3_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01] +# W64-FAKE16: v_max3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01] 0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01 -# GFX12: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01] +# W32-REAL16: v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01] +# W32-FAKE16: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01] +# W64-REAL16: v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: 
[0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01] +# W64-FAKE16: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01] 0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1 -# GFX12: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1] +# W32-REAL16: v_max3_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1] +# W32-FAKE16: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1] +# W64-REAL16: v_max3_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1] +# W64-FAKE16: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1] 0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00 -# GFX12: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_max3_num_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_max3_num_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] 0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3 -# GFX12: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3] +# W32-REAL16: v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3] +# W32-FAKE16: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: 
[0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3] +# W64-REAL16: v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3] +# W64-FAKE16: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3] 0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43 -# GFX12: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43] +# W32-REAL16: v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43] +# W32-FAKE16: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43] +# W64-REAL16: v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43] +# W64-FAKE16: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43] 0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23 -# GFX12: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23] +# W32-REAL16: v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23] +# W32-FAKE16: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23] +# W64-REAL16: v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23] +# W64-FAKE16: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23] 0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00 -# GFX12: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] 
+# W32-FAKE16: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00 +# W32-REAL16: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x00 # GFX12: v_max3_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x00] @@ -4886,49 +4937,100 @@ # GFX12: v_med3_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x21,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] 0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00 -# GFX12: v_min3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00] +# W32-REAL16: v_min3_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00] +# W32-FAKE16: v_min3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00] +# W64-REAL16: v_min3_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00] +# W64-FAKE16: v_min3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00] 
0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01 -# GFX12: v_min3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01] +# W32-REAL16: v_min3_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01] +# W32-FAKE16: v_min3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01] +# W64-REAL16: v_min3_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01] +# W64-FAKE16: v_min3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01] 0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01 -# GFX12: v_min3_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01] +# W32-REAL16: v_min3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01] +# W32-FAKE16: v_min3_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01] +# W64-REAL16: v_min3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01] +# W64-FAKE16: v_min3_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01] 0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01 -# GFX12: v_min3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01] +# W32-REAL16: v_min3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01] +# W32-FAKE16: v_min3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01] +# W64-REAL16: v_min3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01] +# W64-FAKE16: v_min3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01] 0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04 -# GFX12: v_min3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04] +# W32-REAL16: v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04] +# W32-FAKE16: v_min3_num_f16 v5, 
vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04] +# W64-REAL16: v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04] +# W64-FAKE16: v_min3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04] 0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00 -# GFX12: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] 0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1 -# GFX12: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1] +# W32-REAL16: v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1] +# W32-FAKE16: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1] +# W64-REAL16: v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1] +# W64-FAKE16: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1] 0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01 -# GFX12: v_min3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01] +# W32-REAL16: v_min3_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01] +# W32-FAKE16: v_min3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01] +# W64-REAL16: 
v_min3_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01] +# W64-FAKE16: v_min3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01] 0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01 -# GFX12: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01] +# W32-REAL16: v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01] +# W32-FAKE16: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01] +# W64-REAL16: v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01] +# W64-FAKE16: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01] 0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1 -# GFX12: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1] +# W32-REAL16: v_min3_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1] +# W32-FAKE16: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1] +# W64-REAL16: v_min3_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1] +# W64-FAKE16: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1] 0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00 -# GFX12: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_min3_num_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_min3_num_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: 
[0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] 0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3 -# GFX12: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3] +# W32-REAL16: v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3] +# W32-FAKE16: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3] +# W64-REAL16: v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3] +# W64-FAKE16: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3] 0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43 -# GFX12: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43] +# W32-REAL16: v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43] +# W32-FAKE16: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43] +# W64-REAL16: v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43] +# W64-FAKE16: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43] 0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23 -# GFX12: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23] +# W32-REAL16: v_min3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23] +# W32-FAKE16: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23] +# W64-REAL16: v_min3_num_f16 
v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23] +# W64-FAKE16: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23] 0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00 -# GFX12: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00 +# W32-REAL16: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x29,0xd6,0x01,0x05,0x0e,0x00 # GFX12: v_min3_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x29,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt 
b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt index c64fe39d32558..83370defe6349 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt @@ -5199,49 +5199,119 @@ # GFX12: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x90,0x59,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# 
W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] 0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] 0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 
row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] 0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] 0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W32-REAL16: 
v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] 0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff -# GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] 0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01 -# GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] 0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] 0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30 -# GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] 
clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] + +0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff +# W32-REAL16: v_max3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff +# W32-REAL16: 
v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] + +0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01 +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01] + +0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13 +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13] + 0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff # W32-REAL16: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] @@ -5964,49 +6034,119 @@ # W64-FAKE16: v_med3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x51,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] 0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: 
v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] +# W32-FAKE16: 
v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W64-REAL16: 
v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] 0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] 0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# 
W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] 0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] 0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff -# GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] 0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01 -# GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] 0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] 0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30 -# GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] + +0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff +# W32-REAL16: v_min3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.h, 
v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] + +0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01 +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01] + +0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13 +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13] + 0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff # W32-REAL16: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt index 9ed20c70c17a2..2a25e1eefae49 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt @@ -3240,49 +3240,119 @@ # GFX12: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x90,0x59,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] 0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] 0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] 0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] 0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] 0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] 0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] 0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] 
0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] 0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05 -# GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] 0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00 -# GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| 
op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +# W32-REAL16: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +# W64-REAL16: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 +# W32-REAL16: v_max3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05 +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05 +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05] + +0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05 +# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05] +# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05] + 
0x05,0x00,0x4d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 # W32-REAL16: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] @@ -3981,49 +4051,119 @@ # W64-FAKE16: v_med3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x51,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] 0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] 0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] 0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] 0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] 0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05 -# GFX12: 
v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] 0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] 0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp 
v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] 0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] 0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05 -# GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] 0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00 -# GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +# W32-REAL16: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +# W64-REAL16: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 +# W32-REAL16: v_min3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05 +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05 +# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05] + +0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05 +# W32-REAL16: 
v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05] +# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05] + 0x05,0x00,0x4a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 # W32-REAL16: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] From 9f6a1ddb43133328c90edfa29ccd4c714b289cb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Fri, 3 Jan 2025 09:09:23 +0000 Subject: [PATCH 059/480] [mlir][tensor] Introduce `FoldTensorCastUnPackOp` (#121393) This patch specializes `FoldTensorCastProducerOp` for `tensor::UnPackOp` by introducing a dedicated pattern: `FoldTensorCastUnPackOp`. This mirrors a similar update made for `tensor::PackOp` in #114559. Below is the updated rationale tailored to `tensor::UnPackOp`. ISSUE DESCRIPTION Currently, `FoldTensorCastProducerOp` incorrectly folds the following: ```mlir %cast = tensor.cast %dest : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> // Note: `%c8` and `?`. %unpack = tensor.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%c8, 1] into %res : tensor<1x1x?x1xi32> -> tensor<7x?xi32> ``` as: ```mlir // Note: `%c8` and `8`. 
%unpack = tensor.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%c8, 1] into %res : tensor<1x1x8x1xi32> -> tensor<7x?xi32> ``` This triggers an Op verification failure because the folder does not update the inner tile sizes in the unpack Op. This patch addresses the issue by ensuring proper handling of inner tile sizes. ADDITIONAL CHANGES * invalid.mlir: Fixed a typo. * TensorOps.cpp: * Removed unnecessary `(void)tileSize`. * Added comments following the discussion in PR #115772. * Made minor updates to `FoldTensorCastPackOp` for consistency with the newly introduced `FoldTensorCastUnPackOp`. * Tensor/canonicalize.mlir: Ensured consistent usage of `test_attr` (e.g., replaced mixed use of `test_attr` and `some_attr`). --- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 123 +++++++++++++++++---- mlir/test/Dialect/Tensor/canonicalize.mlir | 25 ++++- mlir/test/Dialect/Tensor/invalid.mlir | 2 +- 3 files changed, 123 insertions(+), 27 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index f79c774ceb3e9..24a1d55315319 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -4795,6 +4795,44 @@ static SmallVector getNewOperands(DestinationStyleOpInterface op, return newOperands; } +// Given the (potentially) updated packed type, `newPackedTy`, generates an +// updated mixed-tile-sizes attribute. A tile size is updated only +// when: +// * a dim from newPackedTy is static, and +// * the corresponding size from mixedTiles is still dynamic. +// Otherwise, the original tile size is preserved. +// Note - packed-type-dim and mixed-tile-size should always match! 
+static SmallVector +getNewMixedTileSizes(PatternRewriter &rewriter, Type newPackedTy, + SmallVector mixedTiles) { + SmallVector newMixedTileSizes; + for (auto it : llvm::zip(cast(newPackedTy) + .getShape() + .take_back(mixedTiles.size()), + mixedTiles)) { + int64_t shape = std::get<0>(it); + if (shape == ShapedType::kDynamic) { + newMixedTileSizes.push_back(std::get<1>(it)); + continue; + } + + // If the current result dim is static, update the dynamic mixed-size + // (provided the original value is dynamic). + OpFoldResult tile = std::get<1>(it); + if (Attribute attr = llvm::dyn_cast_if_present(tile)) { + // Already a constant + newMixedTileSizes.push_back(tile); + } else { + assert(getConstantIntValue(tile).value() == shape && + "tile size and dim size don't match!"); + newMixedTileSizes.push_back( + (rewriter.getIntegerAttr(rewriter.getIndexType(), shape))); + } + } + + return newMixedTileSizes; +} + /// Folds a tensor.cast op into a consuming tensor::PackOp op if the /// `tensor.cast` has source that is more static than the consuming op. /// @@ -4821,31 +4859,13 @@ struct FoldTensorCastPackOp : public OpRewritePattern { SmallVector newOperands = getNewOperands(op, newResultTypes); // Get the updated mixed-tile-sizes attribute. 
- SmallVector newMixedTileSizes; - for (auto it : llvm::zip(cast(newResultTypes[0]) - .getShape() - .take_back(op.getMixedTiles().size()), - op.getMixedTiles())) { - int64_t shape = std::get<0>(it); - if (shape == ShapedType::kDynamic) { - newMixedTileSizes.push_back(std::get<1>(it)); - continue; - } - - if (Attribute attr = - llvm::dyn_cast_if_present(std::get<1>(it))) { - // Already a constant - newMixedTileSizes.push_back(std::get<1>(it)); - } else { - int64_t tileSize = getConstantIntValue(std::get<1>(it)).value(); - assert(tileSize == shape && "tile size and dim size don't match!"); - (void)tileSize; - newMixedTileSizes.push_back( - (rewriter.getIntegerAttr(rewriter.getIndexType(), shape))); - } - } + SmallVector newMixedTileSizes = + getNewMixedTileSizes(rewriter, newResultTypes[0], op.getMixedTiles()); // Clone op. + // TODO: Strictly speaking, discardable attributes should be _discarded_ at + // this point. However, in practice, we use them for things that we'd like + // to preserve. Implement a better abstraction. PackOp newOp = rewriter.create( op.getLoc(), newOperands[0], newOperands[1], op.getInnerDimsPos(), newMixedTileSizes, op.getPaddingValue(), op.getOuterDimsPerm()); @@ -4865,6 +4885,59 @@ struct FoldTensorCastPackOp : public OpRewritePattern { } }; +/// Folds a tensor.cast op into a consuming tensor::UnPackOp op if the +/// `tensor.cast` has source that is more static than the consuming op. +/// +/// Example: +/// ```mlir +/// %1 = tensor.cast %0 : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> +/// %2 = tensor.unpack %1 ... : tensor<1x1x?x1xi32> -> tensor<7x?xi32> +/// ``` +/// +/// folds into: +/// +/// ```mlir +/// %2 = tensor.unpack %0 ... 
tensor<1x1x8x1xi32> -> tensor<7x?xi32> +/// ``` +struct FoldTensorCastUnPackOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(UnPackOp op, + PatternRewriter &rewriter) const override { + if (!foldTensorCastPrecondition(op)) + return failure(); + + SmallVector newResultTypes(op->getResultTypes()); + SmallVector newOperands = getNewOperands(op, newResultTypes); + Value sourceTensor = newOperands[0]; + + // Get the updated mixed-tile-sizes attribute. + SmallVector newMixedTileSizes = getNewMixedTileSizes( + rewriter, sourceTensor.getType(), op.getMixedTiles()); + + // Clone op. + // TODO: Strictly speaking, discardable attributes should be _discarded_ at + // this point. However, in practice, we use them for things that we'd like + // to preserve. Implement a better abstraction. + UnPackOp newOp = rewriter.create( + op.getLoc(), sourceTensor, newOperands[1], op.getInnerDimsPos(), + newMixedTileSizes, op.getOuterDimsPerm()); + newOp->setDiscardableAttrs(op->getDiscardableAttrDictionary()); + + // Replace op. + Value oldResult = op.getResult(); + Value newResult = newOp.getResult(); + Value replacement = (newResult.getType() != oldResult.getType()) + ? rewriter.create( + op->getLoc(), oldResult.getType(), newResult) + : newResult; + + rewriter.replaceOp(op, {replacement}); + + return success(); + } +}; + /// Folds a tensor.cast op into a consuming DestinationStyleOpInterface op if /// the `tensor.cast` has source that is more static than the consuming op. /// @@ -4890,7 +4963,8 @@ struct FoldTensorCastProducerOp PatternRewriter &rewriter) const override { // Reject tensor::PackOp - there's dedicated pattern for that instead. 
- if (!foldTensorCastPrecondition(op) || dyn_cast(*op)) + if (!foldTensorCastPrecondition(op) || + isa(*op)) return failure(); SmallVector newResultTypes(op->getResultTypes()); @@ -4923,6 +4997,7 @@ struct FoldTensorCastProducerOp void TensorDialect::getCanonicalizationPatterns( RewritePatternSet &results) const { results.add(getContext()); + results.add(getContext()); results.add(getContext()); } diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index e8fc4ce834e18..01d14871072cd 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -2786,6 +2786,7 @@ func.func @fold_cast_multiple_results(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2x %0:2 = test.destination_style_op ins(%cast : tensor) outs(%cast_0 : tensor) -> tensor, index return %0#1 : index } + // ----- // CHECK-LABEL: func.func @fold_cast_pack_dynamic_tile_size @@ -2794,7 +2795,7 @@ func.func @fold_cast_multiple_results(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2x // CHECK-SAME: %[[PAD:.*]]: i32) -> tensor<1x1x8x1xi32> { // CHECK: %[[PACK:.*]] = tensor.pack %[[SRC]] padding_value(%[[PAD]] : i32) // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %[[DEST]] -// CHECK-SAME: some_attr +// CHECK-SAME: test_attr // CHECK-SAME: : tensor<7x?xi32> -> tensor<1x1x8x1xi32> // CHECK: return %[[PACK]] : tensor<1x1x8x1xi32> func.func @fold_cast_pack_dynamic_tile_size( @@ -2807,13 +2808,33 @@ func.func @fold_cast_pack_dynamic_tile_size( %pack = tensor.pack %src padding_value(%pad : i32) inner_dims_pos = [0, 1] inner_tiles = [%c8, 1] - into %cast {some_attr} : tensor<7x?xi32> -> tensor<1x1x?x1xi32> + into %cast {test_attr} : tensor<7x?xi32> -> tensor<1x1x?x1xi32> %res = tensor.cast %pack : tensor<1x1x?x1xi32> to tensor<1x1x8x1xi32> return %res : tensor<1x1x8x1xi32> } // ----- +// CHECK-LABEL: func.func @fold_cast_unpack_dynamic_tile_size( +// CHECK-SAME: %[[SRC:.*]]: tensor<1x1x8x1xi32>, +// CHECK-SAME: 
%[[DEST:.*]]: tensor<7x?xi32>) -> tensor<7x?xi32> { +// CHECK: %[[RES:.*]] = tensor.unpack %[[SRC]] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %[[DEST]] {test_attr} : tensor<1x1x8x1xi32> -> tensor<7x?xi32> +// CHECK: return %[[RES]] : tensor<7x?xi32> +func.func @fold_cast_unpack_dynamic_tile_size( + %src: tensor<1x1x8x1xi32>, + %res: tensor<7x?xi32>) -> tensor<7x?xi32> { + + %cast = tensor.cast %src : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> + %c8 = arith.constant 8 : index + %unpack = tensor.unpack %cast + inner_dims_pos = [0, 1] + inner_tiles = [%c8, 1] + into %res {test_attr} : tensor<1x1x?x1xi32> -> tensor<7x?xi32> + return %unpack : tensor<7x?xi32> +} + +// ----- + // CHECK-LABEL: func.func @pack_dont_drop_attributes( // CHECK: tensor.pack {{.*}} {test_attr} func.func @pack_dont_drop_attributes(%arg0: tensor, %arg1: tensor<128x?x100x16x1xf16>) -> tensor<128x?x100x16x1xf16> { diff --git a/mlir/test/Dialect/Tensor/invalid.mlir b/mlir/test/Dialect/Tensor/invalid.mlir index 83cb4b9d4ab24..1de3e281bc462 100644 --- a/mlir/test/Dialect/Tensor/invalid.mlir +++ b/mlir/test/Dialect/Tensor/invalid.mlir @@ -699,7 +699,7 @@ func.func @pack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor // ----- -func.func @pack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor<64x32x16xf32>) -> tensor<256x128xf32> { +func.func @unpack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor<64x32x16xf32>) -> tensor<256x128xf32> { // expected-error@+1 {{packed rank != (unpacked rank + num tiling factors), got 3 != 4}} %0 = tensor.unpack %output inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %input : tensor<64x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> From 258256821753504836f797e38d83a8e88daa424d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Fri, 3 Jan 2025 09:11:38 +0000 Subject: [PATCH 060/480] [mlir] Add missing patterns to `linalg.decompose_pack_unpack` TD Op (#121400) This PR is a 
follow-up to #116373 and #116439, where a Transform Dialect (TD) operation was introduced to collect patterns for decomposing tensor.pack. The second patch renamed the patterns and the TD Op. Originally, adding patterns for `tensor.unpack` was marked as a TODO, which this PR addresses. No new tests are introduced in this PR. Instead, existing tests from: * "decompose-tensor-unpack.mlir" are reused. To achieve this: * The test is updated to use the TD operation `transform.apply_patterns.linalg.decompose_pack_unpack` instead of the flag `--test-linalg-transform-patterns="test-decompose-tensor-unpack"`, avoiding artificial tests created solely for the TD Op. * The TD sequence is saved to a new file, "decompose_unpack.mlir", and preloaded using the option. --- mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp | 2 +- .../Dialect/Linalg/decompose-tensor-unpack-tile.mlir | 5 ++++- .../test/Dialect/Linalg/decompose-tensor-unpack.mlir | 4 +++- mlir/test/Dialect/Linalg/td/decompose-unpack.mlir | 12 ++++++++++++ 4 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 mlir/test/Dialect/Linalg/td/decompose-unpack.mlir diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index 60cf897b00de3..50593b08ad74b 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -1656,8 +1656,8 @@ void linalg::populateDecomposeConvolutionPatterns(RewritePatternSet &patterns, } void linalg::populateDecomposePackUnpackPatterns(RewritePatternSet &patterns) { - // TODO: Add and test patterns for tensor.unpack patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); } void linalg::populateDecomposePadPatterns(RewritePatternSet &patterns) { diff --git a/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir index 6d9709caf7093..0dbdf470bbfc9 100644 --- 
a/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir +++ b/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir @@ -1,4 +1,7 @@ -// RUN: mlir-opt -split-input-file --transform-interpreter --canonicalize --test-linalg-transform-patterns="test-decompose-tensor-unpack" %s | FileCheck %s +// RUN: mlir-opt -split-input-file -transform-interpreter --canonicalize \ +// RUN: -transform-preload-library='transform-library-paths=%p/td/decompose-unpack.mlir' \ +// RUN: -transform-interpreter=entry-point=decompose_unpack \ +// RUN: -transform-interpreter %s | FileCheck %s func.func @KCRSsr_to_KCRS(%arg0: tensor<1x1x4x8x8x32xf32>, %arg1: tensor<1x1x128x64xf32>) -> tensor<1x1x128x64xf32> { %0 = tensor.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x4x8x8x32xf32> -> tensor<1x1x128x64xf32> diff --git a/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir index bd60504f53345..ba1f214952562 100644 --- a/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir +++ b/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir @@ -1,4 +1,6 @@ -// RUN: mlir-opt -split-input-file --test-linalg-transform-patterns="test-decompose-tensor-unpack" %s | FileCheck %s +// RUN: mlir-opt -split-input-file \ +// RUN: -transform-preload-library='transform-library-paths=%p/td/decompose-unpack.mlir' \ +// RUN: -transform-interpreter=entry-point=decompose_unpack %s | FileCheck %s func.func @simple_KCRSsr_to_KCRS(%arg0: tensor<1x1x1x1x8x32xf32>, %arg1: tensor<1x1x32x8xf32>) -> tensor<1x1x32x8xf32> { %0 = tensor.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x1x1x8x32xf32> -> tensor<1x1x32x8xf32> diff --git a/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir b/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir new file mode 100644 index 0000000000000..11243634262e0 --- /dev/null +++ b/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir @@ -0,0 +1,12 @@ +module @transforms 
attributes { transform.with_named_sequence } { + transform.named_sequence @decompose_unpack(%module: !transform.any_op {transform.readonly}) { + %pack = transform.structured.match ops{["tensor.unpack"]} in %module : (!transform.any_op) -> !transform.any_op + + %1 = transform.get_parent_op %pack {isolated_from_above} : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %1 { + transform.apply_patterns.linalg.decompose_pack_unpack + } : !transform.any_op + + transform.yield + } +} From 2fae5bdea7c2016d4086aa7ecf3c5d0592ce95c8 Mon Sep 17 00:00:00 2001 From: Shao-Ce SUN Date: Fri, 3 Jan 2025 17:25:42 +0800 Subject: [PATCH 061/480] [RISCV] Add support of Sdext,Sdtrig extentions (#120936) `Sdext` and `Sdtrig` are RISC-V extensions related to debugging. The full specification can be found at https://github.com/riscv/riscv-debug-spec/releases/download/1.0.0-rc4/riscv-debug-specification.pdf --- .../Driver/print-supported-extensions-riscv.c | 2 + .../test/Preprocessor/riscv-target-features.c | 18 ++++++++ llvm/docs/RISCVUsage.rst | 3 ++ llvm/docs/ReleaseNotes.md | 1 + llvm/lib/Target/RISCV/RISCVFeatures.td | 4 ++ llvm/lib/Target/RISCV/RISCVSystemOperands.td | 3 ++ llvm/test/CodeGen/RISCV/attributes.ll | 4 ++ llvm/test/CodeGen/RISCV/features-info.ll | 2 + llvm/test/MC/RISCV/attribute-arch.s | 6 +++ llvm/test/MC/RISCV/machine-csr-names.s | 42 +++++++++++++++++++ .../TargetParser/RISCVISAInfoTest.cpp | 2 + 11 files changed, 87 insertions(+) diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index 395501eb85ccc..f08ff00c9cbeb 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -185,6 +185,8 @@ // CHECK-NEXT: zalasr 0.1 'Zalasr' (Load-Acquire and Store-Release Instructions) // CHECK-NEXT: zvbc32e 0.7 'Zvbc32e' (Vector Carryless Multiplication with 32-bits elements) // CHECK-NEXT: zvkgs 0.7 'Zvkgs' (Vector-Scalar GCM 
instructions for Cryptography) +// CHECK-NEXT: sdext 1.0 'Sdext' (External debugger) +// CHECK-NEXT: sdtrig 1.0 'Sdtrig' (Debugger triggers) // CHECK-NEXT: smctr 1.0 'Smctr' (Control Transfer Records Machine Level) // CHECK-NEXT: ssctr 1.0 'Ssctr' (Control Transfer Records Supervisor Level) // CHECK-NEXT: svukte 0.3 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses) diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index e376821a5517c..c219771135275 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -182,6 +182,8 @@ // Experimental extensions +// CHECK-NOT: __riscv_sdext{{.*$}} +// CHECK-NOT: __riscv_sdtrig{{.*$}} // CHECK-NOT: __riscv_smctr{{.*$}} // CHECK-NOT: __riscv_smmpm{{.*$}} // CHECK-NOT: __riscv_smnpm{{.*$}} @@ -1795,6 +1797,22 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-SUPM-EXT %s // CHECK-SUPM-EXT: __riscv_supm 1000000{{$}} +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_sdext1p0 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SDEXT-EXT %s +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_sdext1p0 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SDEXT-EXT %s +// CHECK-SDEXT-EXT: __riscv_sdext 1000000{{$}} + +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_sdtrig1p0 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SDTRIG-EXT %s +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_sdtrig1p0 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SDTRIG-EXT %s +// CHECK-SDTRIG-EXT: __riscv_sdtrig 1000000{{$}} + // RUN: %clang --target=riscv32 -menable-experimental-extensions \ // RUN: -march=rv32i_smctr1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SMCTR-EXT %s diff --git 
a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index eaaad6c516818..835b910ec452d 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -326,6 +326,9 @@ The primary goal of experimental support is to assist in the process of ratifica ``experimental-zvbc32e``, ``experimental-zvkgs`` LLVM implements the `0.7 release specification `__. +``experimental-sdext``, ``experimental-sdtrig`` + LLVM implements the `1.0-rc4 specification `__. + ``experimental-smctr``, ``experimental-ssctr`` LLVM implements the `1.0-rc3 specification `__. diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index be62a7e8696b4..11ee9864e5174 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -232,6 +232,7 @@ Changes to the RISC-V Backend extension. * Adds experimental assembler support for the Qualcomm uC 'Xqcicli` (Conditional Load Immediate) extension. +* Added ``Sdext`` and ``Sdtrig`` extensions. Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 3885b95a8937a..0074be35798ac 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -844,6 +844,10 @@ def HasStdExtH : Predicate<"Subtarget->hasStdExtH()">, // Supervisor extensions +def FeatureStdExtSdext : RISCVExperimentalExtension<1, 0, "External debugger">; + +def FeatureStdExtSdtrig : RISCVExperimentalExtension<1, 0, "Debugger triggers">; + def FeatureStdExtShgatpa : RISCVExtension<1, 0, "SvNNx4 mode supported for all modes supported by satp, as well as Bare">; diff --git a/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/llvm/lib/Target/RISCV/RISCVSystemOperands.td index d85b4a9cf77b3..72275daa1b8d1 100644 --- a/llvm/lib/Target/RISCV/RISCVSystemOperands.td +++ b/llvm/lib/Target/RISCV/RISCVSystemOperands.td @@ -323,7 +323,10 @@ def : SysReg<"tselect", 0x7A0>; def : SysReg<"tdata1", 0x7A1>; def : SysReg<"tdata2", 0x7A2>; def : 
SysReg<"tdata3", 0x7A3>; +def : SysReg<"tinfo", 0x7A4>; +def : SysReg<"tcontrol", 0x7A5>; def : SysReg<"mcontext", 0x7A8>; +def : SysReg<"mscontext", 0x7AA>; //===----------------------------------------------------------------------===// // Debug Mode Registers diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index bcf945470d85b..7e55e0590ec59 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -296,6 +296,8 @@ ; RUN: llc -mtriple=riscv64 -mattr=+supm %s -o - | FileCheck --check-prefix=RV64SUPM %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-smctr %s -o - | FileCheck --check-prefix=RV64SMCTR %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-ssctr %s -o - | FileCheck --check-prefix=RV64SSCTR %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-sdext %s -o - | FileCheck --check-prefix=RV64SDEXT %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-sdtrig %s -o - | FileCheck --check-prefix=RV64SDTRIG %s ; Tests for profile features. ; RUN: llc -mtriple=riscv32 -mattr=+rvi20u32 %s -o - | FileCheck --check-prefix=RVI20U32 %s @@ -605,6 +607,8 @@ ; RV64SUPM: .attribute 5, "rv64i2p1_supm1p0" ; RV64SMCTR: .attribute 5, "rv64i2p1_smctr1p0_sscsrind1p0" ; RV64SSCTR: .attribute 5, "rv64i2p1_sscsrind1p0_ssctr1p0" +; RV64SDEXT: .attribute 5, "rv64i2p1_sdext1p0" +; RV64SDTRIG: .attribute 5, "rv64i2p1_sdtrig1p0" ; RVI20U32: .attribute 5, "rv32i2p1" ; RVI20U64: .attribute 5, "rv64i2p1" diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 99db90c5fa925..70fbda47a14a1 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -15,6 +15,8 @@ ; CHECK: e - 'E' (Embedded Instruction Set with 16 GPRs). ; CHECK: experimental - Experimental intrinsics. ; CHECK: experimental-rvm23u32 - RISC-V experimental-rvm23u32 profile. +; CHECK: experimental-sdext - 'Sdext' (External debugger). 
+; CHECK: experimental-sdtrig - 'Sdtrig' (Debugger triggers). ; CHECK: experimental-smctr - 'Smctr' (Control Transfer Records Machine Level). ; CHECK: experimental-ssctr - 'Ssctr' (Control Transfer Records Supervisor Level). ; CHECK: experimental-svukte - 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses). diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s index 6ffaa62d50dcf..4e77a53bd706c 100644 --- a/llvm/test/MC/RISCV/attribute-arch.s +++ b/llvm/test/MC/RISCV/attribute-arch.s @@ -467,3 +467,9 @@ .attribute arch, "rv32i_ssctr1p0" # CHECK: attribute 5, "rv32i2p1_sscsrind1p0_ssctr1p0" + +.attribute arch, "rv32i_sdext1p0" +# CHECK: attribute 5, "rv32i2p1_sdext1p0" + +.attribute arch, "rv32i_sdtrig1p0" +# CHECK: attribute 5, "rv32i2p1_sdtrig1p0" diff --git a/llvm/test/MC/RISCV/machine-csr-names.s b/llvm/test/MC/RISCV/machine-csr-names.s index 07b948a78e6c2..ba2a79f1f6aa1 100644 --- a/llvm/test/MC/RISCV/machine-csr-names.s +++ b/llvm/test/MC/RISCV/machine-csr-names.s @@ -1419,6 +1419,34 @@ csrrs t1, tdata3, zero # uimm12 csrrs t2, 0x7A3, zero +# tinfo +# name +# CHECK-INST: csrrs t1, tinfo, zero +# CHECK-ENC: encoding: [0x73,0x23,0x40,0x7a] +# CHECK-INST-ALIAS: csrr t1, tinfo +# uimm12 +# CHECK-INST: csrrs t2, tinfo, zero +# CHECK-ENC: encoding: [0xf3,0x23,0x40,0x7a] +# CHECK-INST-ALIAS: csrr t2, tinfo +# name +csrrs t1, tinfo, zero +# uimm12 +csrrs t2, 0x7A4, zero + +# tcontrol +# name +# CHECK-INST: csrrs t1, tcontrol, zero +# CHECK-ENC: encoding: [0x73,0x23,0x50,0x7a] +# CHECK-INST-ALIAS: csrr t1, tcontrol +# uimm12 +# CHECK-INST: csrrs t2, tcontrol, zero +# CHECK-ENC: encoding: [0xf3,0x23,0x50,0x7a] +# CHECK-INST-ALIAS: csrr t2, tcontrol +# name +csrrs t1, tcontrol, zero +# uimm12 +csrrs t2, 0x7A5, zero + # mcontext # name # CHECK-INST: csrrs t1, mcontext, zero @@ -1433,6 +1461,20 @@ csrrs t1, mcontext, zero # uimm12 csrrs t2, 0x7A8, zero +# mscontext +# name +# CHECK-INST: csrrs t1, mscontext, 
zero +# CHECK-ENC: encoding: [0x73,0x23,0xa0,0x7a] +# CHECK-INST-ALIAS: csrr t1, mscontext +# uimm12 +# CHECK-INST: csrrs t2, mscontext, zero +# CHECK-ENC: encoding: [0xf3,0x23,0xa0,0x7a] +# CHECK-INST-ALIAS: csrr t2, mscontext +# name +csrrs t1, mscontext, zero +# uimm12 +csrrs t2, 0x7AA, zero + ####################### # Debug Mode Registers ######################## diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index f631f26cf482e..3ea5afce56fa3 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -1110,6 +1110,8 @@ Experimental extensions zalasr 0.1 zvbc32e 0.7 zvkgs 0.7 + sdext 1.0 + sdtrig 1.0 smctr 1.0 ssctr 1.0 svukte 0.3 From e4e47cef55886036651ff7f0dfd8475d3a158a4c Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Fri, 3 Jan 2025 10:35:01 +0100 Subject: [PATCH 062/480] [bazel] Fix the broken llvm-tblgen build for 27f30029741ecf023baece7b3dde1ff9011ffefc --- .../llvm-project-overlay/llvm/BUILD.bazel | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 36e266d26fc3d..18ac78174856b 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -645,18 +645,20 @@ cc_binary( cc_binary( name = "llvm-min-tblgen", srcs = [ - "utils/TableGen/ARMTargetDefEmitter.cpp", - "utils/TableGen/Attributes.cpp", + "utils/TableGen/Basic/ARMTargetDefEmitter.cpp", + "utils/TableGen/Basic/Attributes.cpp", "utils/TableGen/Basic/CodeGenIntrinsics.cpp", "utils/TableGen/Basic/CodeGenIntrinsics.h", "utils/TableGen/Basic/SDNodeProperties.cpp", "utils/TableGen/Basic/SDNodeProperties.h", + "utils/TableGen/Basic/TableGen.h", + "utils/TableGen/Basic/TableGen.cpp", "utils/TableGen/Basic/SequenceToOffsetTable.h", - "utils/TableGen/DirectiveEmitter.cpp", - 
"utils/TableGen/IntrinsicEmitter.cpp", - "utils/TableGen/RISCVTargetDefEmitter.cpp", - "utils/TableGen/TableGen.cpp", - "utils/TableGen/VTEmitter.cpp", + "utils/TableGen/Basic/DirectiveEmitter.cpp", + "utils/TableGen/Basic/IntrinsicEmitter.cpp", + "utils/TableGen/Basic/RISCVTargetDefEmitter.cpp", + "utils/TableGen/Basic/VTEmitter.cpp", + "utils/TableGen/llvm-min-tblgen.cpp", ], copts = llvm_copts, stamp = 0, @@ -715,7 +717,10 @@ cc_binary( # regular dependency. "include/llvm/MC/*.h", ], - exclude = ["utils/TableGen/Common/GlobalISel/CodeExpander.cpp"], + exclude = [ + "utils/TableGen/Common/GlobalISel/CodeExpander.cpp", + "utils/TableGen/llvm-min-tblgen.cpp", + ], ) + [ "include/llvm/TargetParser/SubtargetFeature.h", ], From 366e836051adf5eb352b00828541197729e061e6 Mon Sep 17 00:00:00 2001 From: ShihPo Hung Date: Thu, 2 Jan 2025 22:43:22 -0800 Subject: [PATCH 063/480] [RISCV][NFC] precommit test for fcmp with f16 --- llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll | 254 +------ .../Analysis/CostModel/RISCV/rvv-fcmp-f16.ll | 677 ++++++++++++++++++ 2 files changed, 678 insertions(+), 253 deletions(-) create mode 100644 llvm/test/Analysis/CostModel/RISCV/rvv-fcmp-f16.ll diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll index 56f9e18c6c5a0..d1b230c35ff2d 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll @@ -875,15 +875,6 @@ define void @icmp_sle() { define void @fcmp_oeq() { ; CHECK-LABEL: 'fcmp_oeq' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oeq <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oeq <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oeq <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp oeq <16 x half> undef, undef 
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oeq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oeq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oeq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp oeq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp oeq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp oeq <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp oeq <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp oeq <8 x float> undef, undef @@ -902,16 +893,7 @@ define void @fcmp_oeq() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp oeq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp oeq <2 x half> undef, undef - %v4f16 = fcmp oeq <4 x half> undef, undef - %v8f16 = fcmp oeq <8 x half> undef, undef - %v16f16 = fcmp oeq <16 x half> undef, undef - %nxv1f16 = fcmp oeq undef, undef - %nxv2f16 = fcmp oeq undef, undef - %nxv4f16 = fcmp oeq undef, undef - %nxv8f16 = fcmp oeq undef, undef - %nxv16f16 = fcmp oeq undef, undef %v2f32 = fcmp oeq <2 x float> undef, undef %v4f32 = fcmp oeq <4 x float> undef, undef @@ -938,15 +920,6 @@ define void @fcmp_oeq() { define void @fcmp_one() { ; CHECK-LABEL: 'fcmp_one' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = fcmp one <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = fcmp one <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = fcmp one <8 x half> undef, undef -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = fcmp one <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16 = fcmp one undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16 = fcmp one undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16 = fcmp one undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8f16 = fcmp one undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16f16 = fcmp one undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = fcmp one <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = fcmp one <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8f32 = fcmp one <8 x float> undef, undef @@ -965,16 +938,7 @@ define void @fcmp_one() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv8f64 = fcmp one undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp one <2 x half> undef, undef - %v4f16 = fcmp one <4 x half> undef, undef - %v8f16 = fcmp one <8 x half> undef, undef - %v16f16 = fcmp one <16 x half> undef, undef - %nxv1f16 = fcmp one undef, undef - %nxv2f16 = fcmp one undef, undef - %nxv4f16 = fcmp one undef, undef - %nxv8f16 = fcmp one undef, undef - %nxv16f16 = fcmp one undef, undef %v2f32 = fcmp one <2 x float> undef, undef %v4f32 = fcmp one <4 x float> undef, undef @@ -1001,15 +965,6 @@ define void @fcmp_one() { define void @fcmp_olt() { ; CHECK-LABEL: 'fcmp_olt' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp olt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp olt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp olt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp olt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp olt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp olt <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp olt <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp olt <8 x float> undef, undef @@ -1028,16 +983,7 @@ define void @fcmp_olt() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp olt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp olt <2 x half> undef, undef - %v4f16 = fcmp olt <4 x half> undef, undef - %v8f16 = fcmp olt <8 x half> undef, undef - %v16f16 = fcmp olt <16 x half> undef, undef - %nxv1f16 = fcmp olt undef, undef - %nxv2f16 = fcmp olt undef, undef - %nxv4f16 = fcmp olt undef, undef - %nxv8f16 = fcmp olt undef, undef - %nxv16f16 = fcmp olt undef, undef %v2f32 = fcmp olt <2 x float> undef, undef %v4f32 = fcmp olt <4 x float> undef, undef @@ -1064,15 +1010,6 @@ define void @fcmp_olt() { define void @fcmp_ole() { ; CHECK-LABEL: 'fcmp_ole' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ole <2 x half> undef, undef -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ole <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ole <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp ole <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ole undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ole undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ole undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp ole undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp ole undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp ole <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp ole <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp ole <8 x float> undef, undef @@ -1091,16 +1028,7 @@ define void @fcmp_ole() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp ole undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp ole <2 x half> undef, undef - %v4f16 = fcmp ole <4 x half> undef, undef - %v8f16 = fcmp ole <8 x half> undef, undef - %v16f16 = fcmp ole <16 x half> undef, undef - %nxv1f16 = fcmp ole undef, undef - %nxv2f16 = fcmp ole undef, undef - %nxv4f16 = fcmp ole undef, undef - %nxv8f16 = fcmp ole undef, undef - %nxv16f16 = fcmp ole undef, undef %v2f32 = fcmp ole <2 x float> undef, undef %v4f32 = fcmp ole <4 x float> undef, undef @@ -1127,15 +1055,6 @@ define void @fcmp_ole() { define void @fcmp_ogt() { ; CHECK-LABEL: 'fcmp_ogt' 
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ogt <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ogt <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ogt <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp ogt <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ogt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ogt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ogt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp ogt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp ogt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp ogt <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp ogt <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp ogt <8 x float> undef, undef @@ -1154,16 +1073,7 @@ define void @fcmp_ogt() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp ogt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp ogt <2 x half> undef, undef - %v4f16 = fcmp ogt <4 x half> undef, undef - %v8f16 = fcmp ogt <8 x half> undef, undef - %v16f16 = fcmp ogt <16 x half> undef, undef - %nxv1f16 = fcmp ogt undef, undef - %nxv2f16 = fcmp ogt undef, undef - %nxv4f16 = fcmp ogt undef, undef - %nxv8f16 = fcmp ogt undef, undef - %nxv16f16 = fcmp ogt undef, undef %v2f32 = fcmp ogt <2 x float> undef, undef %v4f32 = fcmp ogt <4 x 
float> undef, undef @@ -1190,15 +1100,6 @@ define void @fcmp_ogt() { define void @fcmp_oge() { ; CHECK-LABEL: 'fcmp_oge' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oge <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oge <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oge <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp oge <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp oge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp oge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp oge <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp oge <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp oge <8 x float> undef, undef @@ -1217,16 +1118,7 @@ define void @fcmp_oge() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp oge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp oge <2 x half> undef, undef - %v4f16 = fcmp oge <4 x half> undef, undef - %v8f16 = fcmp oge <8 x half> undef, undef - %v16f16 = fcmp oge <16 x half> undef, undef - %nxv1f16 = fcmp oge undef, undef - %nxv2f16 = fcmp oge undef, undef - %nxv4f16 = fcmp oge undef, undef - %nxv8f16 = fcmp 
oge undef, undef - %nxv16f16 = fcmp oge undef, undef %v2f32 = fcmp oge <2 x float> undef, undef %v4f32 = fcmp oge <4 x float> undef, undef @@ -1253,15 +1145,6 @@ define void @fcmp_oge() { define void @fcmp_ueq() { ; CHECK-LABEL: 'fcmp_ueq' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = fcmp ueq <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = fcmp ueq <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = fcmp ueq <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = fcmp ueq <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16 = fcmp ueq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16 = fcmp ueq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16 = fcmp ueq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8f16 = fcmp ueq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16f16 = fcmp ueq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = fcmp ueq <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = fcmp ueq <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8f32 = fcmp ueq <8 x float> undef, undef @@ -1280,16 +1163,7 @@ define void @fcmp_ueq() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv8f64 = fcmp ueq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp ueq <2 x half> undef, undef - %v4f16 = fcmp ueq <4 x half> undef, undef - %v8f16 = fcmp ueq <8 x half> undef, undef - %v16f16 = fcmp ueq <16 x half> undef, undef - 
%nxv1f16 = fcmp ueq undef, undef - %nxv2f16 = fcmp ueq undef, undef - %nxv4f16 = fcmp ueq undef, undef - %nxv8f16 = fcmp ueq undef, undef - %nxv16f16 = fcmp ueq undef, undef %v2f32 = fcmp ueq <2 x float> undef, undef %v4f32 = fcmp ueq <4 x float> undef, undef @@ -1316,15 +1190,6 @@ define void @fcmp_ueq() { define void @fcmp_une() { ; CHECK-LABEL: 'fcmp_une' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp une <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp une <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp une <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp une <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp une undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp une undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp une undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp une undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp une undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp une <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp une <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp une <8 x float> undef, undef @@ -1343,16 +1208,7 @@ define void @fcmp_une() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp une undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp une <2 x half> undef, undef - %v4f16 = fcmp une 
<4 x half> undef, undef - %v8f16 = fcmp une <8 x half> undef, undef - %v16f16 = fcmp une <16 x half> undef, undef - %nxv1f16 = fcmp une undef, undef - %nxv2f16 = fcmp une undef, undef - %nxv4f16 = fcmp une undef, undef - %nxv8f16 = fcmp une undef, undef - %nxv16f16 = fcmp une undef, undef %v2f32 = fcmp une <2 x float> undef, undef %v4f32 = fcmp une <4 x float> undef, undef @@ -1379,15 +1235,6 @@ define void @fcmp_une() { define void @fcmp_ult() { ; CHECK-LABEL: 'fcmp_ult' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ult <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ult <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ult <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ult <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp ult <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp ult <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = fcmp ult <8 x float> undef, undef @@ -1406,16 +1253,7 @@ define void @fcmp_ult() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv8f64 = fcmp ult undef, undef ; CHECK-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp ult <2 x half> undef, undef - %v4f16 = fcmp ult <4 x half> undef, undef - %v8f16 = fcmp ult <8 x half> undef, undef - %v16f16 = fcmp ult <16 x half> undef, undef - %nxv1f16 = fcmp ult undef, undef - %nxv2f16 = fcmp ult undef, undef - %nxv4f16 = fcmp ult undef, undef - %nxv8f16 = fcmp ult undef, undef - %nxv16f16 = fcmp ult undef, undef %v2f32 = fcmp ult <2 x float> undef, undef %v4f32 = fcmp ult <4 x float> undef, undef @@ -1442,15 +1280,6 @@ define void @fcmp_ult() { define void @fcmp_ule() { ; CHECK-LABEL: 'fcmp_ule' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ule <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ule <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ule <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ule <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp ule <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp ule <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = fcmp ule <8 x float> undef, undef @@ -1469,16 +1298,7 @@ define void @fcmp_ule() { ; CHECK-NEXT: 
Cost Model: Found an estimated cost of 9 for instruction: %nxv8f64 = fcmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp ule <2 x half> undef, undef - %v4f16 = fcmp ule <4 x half> undef, undef - %v8f16 = fcmp ule <8 x half> undef, undef - %v16f16 = fcmp ule <16 x half> undef, undef - %nxv1f16 = fcmp ule undef, undef - %nxv2f16 = fcmp ule undef, undef - %nxv4f16 = fcmp ule undef, undef - %nxv8f16 = fcmp ule undef, undef - %nxv16f16 = fcmp ule undef, undef %v2f32 = fcmp ule <2 x float> undef, undef %v4f32 = fcmp ule <4 x float> undef, undef @@ -1505,15 +1325,6 @@ define void @fcmp_ule() { define void @fcmp_ugt() { ; CHECK-LABEL: 'fcmp_ugt' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ugt <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ugt <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ugt <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ugt <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp ugt <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp ugt <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %v8f32 = fcmp ugt <8 x float> undef, undef @@ -1532,16 +1343,7 @@ define void @fcmp_ugt() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv8f64 = fcmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp ugt <2 x half> undef, undef - %v4f16 = fcmp ugt <4 x half> undef, undef - %v8f16 = fcmp ugt <8 x half> undef, undef - %v16f16 = fcmp ugt <16 x half> undef, undef - %nxv1f16 = fcmp ugt undef, undef - %nxv2f16 = fcmp ugt undef, undef - %nxv4f16 = fcmp ugt undef, undef - %nxv8f16 = fcmp ugt undef, undef - %nxv16f16 = fcmp ugt undef, undef %v2f32 = fcmp ugt <2 x float> undef, undef %v4f32 = fcmp ugt <4 x float> undef, undef @@ -1568,15 +1370,6 @@ define void @fcmp_ugt() { define void @fcmp_uge() { ; CHECK-LABEL: 'fcmp_uge' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp uge <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp uge <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp uge <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp uge <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp uge <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %v4f32 = fcmp uge <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = fcmp uge <8 x float> undef, undef @@ -1595,16 +1388,7 @@ define void @fcmp_uge() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv8f64 = fcmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp uge <2 x half> undef, undef - %v4f16 = fcmp uge <4 x half> undef, undef - %v8f16 = fcmp uge <8 x half> undef, undef - %v16f16 = fcmp uge <16 x half> undef, undef - %nxv1f16 = fcmp uge undef, undef - %nxv2f16 = fcmp uge undef, undef - %nxv4f16 = fcmp uge undef, undef - %nxv8f16 = fcmp uge undef, undef - %nxv16f16 = fcmp uge undef, undef %v2f32 = fcmp uge <2 x float> undef, undef %v4f32 = fcmp uge <4 x float> undef, undef @@ -1631,15 +1415,6 @@ define void @fcmp_uge() { define void @fcmp_true() { ; CHECK-LABEL: 'fcmp_true' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp true <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp true <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp true <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp true <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp true undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp true undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp true undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp true undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp true undef, undef ; CHECK-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v2f32 = fcmp true <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp true <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32 = fcmp true <8 x float> undef, undef @@ -1658,16 +1433,7 @@ define void @fcmp_true() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64 = fcmp true undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp true <2 x half> undef, undef - %v4f16 = fcmp true <4 x half> undef, undef - %v8f16 = fcmp true <8 x half> undef, undef - %v16f16 = fcmp true <16 x half> undef, undef - %nxv1f16 = fcmp true undef, undef - %nxv2f16 = fcmp true undef, undef - %nxv4f16 = fcmp true undef, undef - %nxv8f16 = fcmp true undef, undef - %nxv16f16 = fcmp true undef, undef %v2f32 = fcmp true <2 x float> undef, undef %v4f32 = fcmp true <4 x float> undef, undef @@ -1694,15 +1460,6 @@ define void @fcmp_true() { define void @fcmp_false() { ; CHECK-LABEL: 'fcmp_false' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp false <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp false <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp false <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp false <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp false undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp false undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp false undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp false 
undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp false undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp false <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp false <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32 = fcmp false <8 x float> undef, undef @@ -1721,16 +1478,7 @@ define void @fcmp_false() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64 = fcmp false undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16 = fcmp false <2 x half> undef, undef - %v4f16 = fcmp false <4 x half> undef, undef - %v8f16 = fcmp false <8 x half> undef, undef - %v16f16 = fcmp false <16 x half> undef, undef - - %nxv1f16 = fcmp false undef, undef - %nxv2f16 = fcmp false undef, undef - %nxv4f16 = fcmp false undef, undef - %nxv8f16 = fcmp false undef, undef - %nxv16f16 = fcmp false undef, undef + %v2f32 = fcmp false <2 x float> undef, undef %v4f32 = fcmp false <4 x float> undef, undef diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-fcmp-f16.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-fcmp-f16.ll new file mode 100644 index 0000000000000..8396e80ca3e80 --- /dev/null +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-fcmp-f16.ll @@ -0,0 +1,677 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh -riscv-v-vector-bits-min=-1 < %s | FileCheck %s --check-prefix=NOF16 +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=-1 < %s | FileCheck %s --check-prefix=VFH +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin -riscv-v-vector-bits-min=-1 < %s | 
FileCheck %s --check-prefix=VFHMIN + +define void @fcmp_oeq() { +; NOF16-LABEL: 'fcmp_oeq' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp oeq <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp oeq <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp oeq <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp oeq <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp oeq undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp oeq undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp oeq undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp oeq undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp oeq undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_oeq' +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oeq <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oeq <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oeq <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp oeq <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oeq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oeq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oeq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp oeq undef, undef +; VFH-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp oeq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_oeq' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oeq <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oeq <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oeq <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp oeq <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oeq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oeq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oeq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp oeq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp oeq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp oeq <2 x half> undef, undef + %v4f16 = fcmp oeq <4 x half> undef, undef + %v8f16 = fcmp oeq <8 x half> undef, undef + %v16f16 = fcmp oeq <16 x half> undef, undef + %nxv1f16 = fcmp oeq undef, undef + %nxv2f16 = fcmp oeq undef, undef + %nxv4f16 = fcmp oeq undef, undef + %nxv8f16 = fcmp oeq undef, undef + %nxv16f16 = fcmp oeq undef, undef + ret void +} +define void @fcmp_one() { +; NOF16-LABEL: 'fcmp_one' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp one <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp one <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 
for instruction: %v8f16 = fcmp one <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp one <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp one undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp one undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp one undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp one undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp one undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_one' +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = fcmp one <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = fcmp one <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = fcmp one <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = fcmp one <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16 = fcmp one undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16 = fcmp one undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16 = fcmp one undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8f16 = fcmp one undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16f16 = fcmp one undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_one' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp one <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %v4f16 = fcmp one <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp one <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp one <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp one undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp one undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp one undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp one undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp one undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp one <2 x half> undef, undef + %v4f16 = fcmp one <4 x half> undef, undef + %v8f16 = fcmp one <8 x half> undef, undef + %v16f16 = fcmp one <16 x half> undef, undef + %nxv1f16 = fcmp one undef, undef + %nxv2f16 = fcmp one undef, undef + %nxv4f16 = fcmp one undef, undef + %nxv8f16 = fcmp one undef, undef + %nxv16f16 = fcmp one undef, undef + ret void +} +define void @fcmp_olt() { +; NOF16-LABEL: 'fcmp_olt' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp olt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp olt undef, undef +; NOF16-NEXT: Cost 
Model: Invalid cost for instruction: %nxv4f16 = fcmp olt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp olt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp olt undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_olt' +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp olt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp olt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp olt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp olt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp olt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_olt' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = 
fcmp olt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp olt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp olt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp olt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp olt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp olt <2 x half> undef, undef + %v4f16 = fcmp olt <4 x half> undef, undef + %v8f16 = fcmp olt <8 x half> undef, undef + %v16f16 = fcmp olt <16 x half> undef, undef + %nxv1f16 = fcmp olt undef, undef + %nxv2f16 = fcmp olt undef, undef + %nxv4f16 = fcmp olt undef, undef + %nxv8f16 = fcmp olt undef, undef + %nxv16f16 = fcmp olt undef, undef + ret void +} +define void @fcmp_ole() { +; NOF16-LABEL: 'fcmp_ole' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ole <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ole <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ole <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ole <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ole undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ole undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ole undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ole undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ole undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_ole' +; VFH-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ole <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ole <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ole <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp ole <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ole undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ole undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ole undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp ole undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp ole undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_ole' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ole <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ole <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ole <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ole <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ole undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ole undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ole undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ole undef, undef +; VFHMIN-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ole undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp ole <2 x half> undef, undef + %v4f16 = fcmp ole <4 x half> undef, undef + %v8f16 = fcmp ole <8 x half> undef, undef + %v16f16 = fcmp ole <16 x half> undef, undef + %nxv1f16 = fcmp ole undef, undef + %nxv2f16 = fcmp ole undef, undef + %nxv4f16 = fcmp ole undef, undef + %nxv8f16 = fcmp ole undef, undef + %nxv16f16 = fcmp ole undef, undef + ret void +} +define void @fcmp_ogt() { +; NOF16-LABEL: 'fcmp_ogt' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ogt <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ogt <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ogt <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ogt <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ogt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ogt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ogt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ogt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ogt undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_ogt' +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ogt <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ogt <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ogt <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost 
of 2 for instruction: %v16f16 = fcmp ogt <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ogt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ogt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ogt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp ogt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp ogt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_ogt' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ogt <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ogt <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ogt <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ogt <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ogt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ogt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ogt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ogt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ogt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp ogt <2 x half> undef, undef + %v4f16 = fcmp ogt <4 x half> undef, undef + %v8f16 = fcmp ogt <8 x half> undef, undef + %v16f16 = fcmp ogt <16 x half> undef, undef + %nxv1f16 = fcmp ogt undef, undef 
+ %nxv2f16 = fcmp ogt undef, undef + %nxv4f16 = fcmp ogt undef, undef + %nxv8f16 = fcmp ogt undef, undef + %nxv16f16 = fcmp ogt undef, undef + ret void +} +define void @fcmp_oge() { +; NOF16-LABEL: 'fcmp_oge' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp oge <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp oge <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp oge <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp oge <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp oge undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp oge undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp oge undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp oge undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp oge undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_oge' +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oge <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oge <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oge <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp oge <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oge undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oge undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oge undef, undef 
+; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp oge undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp oge undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_oge' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oge <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oge <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oge <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp oge <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp oge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp oge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp oge <2 x half> undef, undef + %v4f16 = fcmp oge <4 x half> undef, undef + %v8f16 = fcmp oge <8 x half> undef, undef + %v16f16 = fcmp oge <16 x half> undef, undef + %nxv1f16 = fcmp oge undef, undef + %nxv2f16 = fcmp oge undef, undef + %nxv4f16 = fcmp oge undef, undef + %nxv8f16 = fcmp oge undef, undef + %nxv16f16 = fcmp oge undef, undef + ret void +} +define void @fcmp_ueq() { +; NOF16-LABEL: 'fcmp_ueq' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ueq <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 
4 for instruction: %v4f16 = fcmp ueq <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ueq <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ueq <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ueq undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ueq undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ueq undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ueq undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ueq undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_ueq' +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = fcmp ueq <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = fcmp ueq <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = fcmp ueq <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = fcmp ueq <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16 = fcmp ueq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16 = fcmp ueq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16 = fcmp ueq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv8f16 = fcmp ueq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16f16 = fcmp ueq undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_ueq' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %v2f16 = fcmp ueq <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ueq <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ueq <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ueq <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ueq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ueq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ueq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ueq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ueq undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp ueq <2 x half> undef, undef + %v4f16 = fcmp ueq <4 x half> undef, undef + %v8f16 = fcmp ueq <8 x half> undef, undef + %v16f16 = fcmp ueq <16 x half> undef, undef + %nxv1f16 = fcmp ueq undef, undef + %nxv2f16 = fcmp ueq undef, undef + %nxv4f16 = fcmp ueq undef, undef + %nxv8f16 = fcmp ueq undef, undef + %nxv16f16 = fcmp ueq undef, undef + ret void +} +define void @fcmp_une() { +; NOF16-LABEL: 'fcmp_une' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp une <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp une <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp une <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp une <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp une undef, 
undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp une undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp une undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp une undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp une undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_une' +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp une <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp une <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp une <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp une <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp une undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp une undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp une undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp une undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp une undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_une' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp une <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp une <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp une <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp 
une <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp une undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp une undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp une undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp une undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp une undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp une <2 x half> undef, undef + %v4f16 = fcmp une <4 x half> undef, undef + %v8f16 = fcmp une <8 x half> undef, undef + %v16f16 = fcmp une <16 x half> undef, undef + %nxv1f16 = fcmp une undef, undef + %nxv2f16 = fcmp une undef, undef + %nxv4f16 = fcmp une undef, undef + %nxv8f16 = fcmp une undef, undef + %nxv16f16 = fcmp une undef, undef + ret void +} +define void @fcmp_ult() { +; NOF16-LABEL: 'fcmp_ult' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ult <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ult <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ult <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ult <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ult undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ult undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ult undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ult undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ult undef, undef +; 
NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_ult' +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ult <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ult <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ult <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ult <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ult undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ult undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ult undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ult undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ult undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_ult' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ult <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ult <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ult <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ult <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ult undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ult undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ult undef, undef +; VFHMIN-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ult undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ult undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp ult <2 x half> undef, undef + %v4f16 = fcmp ult <4 x half> undef, undef + %v8f16 = fcmp ult <8 x half> undef, undef + %v16f16 = fcmp ult <16 x half> undef, undef + %nxv1f16 = fcmp ult undef, undef + %nxv2f16 = fcmp ult undef, undef + %nxv4f16 = fcmp ult undef, undef + %nxv8f16 = fcmp ult undef, undef + %nxv16f16 = fcmp ult undef, undef + ret void +} +define void @fcmp_ule() { +; NOF16-LABEL: 'fcmp_ule' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ule <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ule <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ule <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ule <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ule undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ule undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ule undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ule undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ule undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_ule' +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ule <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ule <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %v8f16 = fcmp ule <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ule <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ule undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ule undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ule undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ule undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ule undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_ule' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ule <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ule <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ule <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ule <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ule undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ule undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ule undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ule undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ule undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp ule <2 x half> undef, undef + %v4f16 = fcmp ule <4 x half> undef, 
undef + %v8f16 = fcmp ule <8 x half> undef, undef + %v16f16 = fcmp ule <16 x half> undef, undef + %nxv1f16 = fcmp ule undef, undef + %nxv2f16 = fcmp ule undef, undef + %nxv4f16 = fcmp ule undef, undef + %nxv8f16 = fcmp ule undef, undef + %nxv16f16 = fcmp ule undef, undef + ret void +} +define void @fcmp_ugt() { +; NOF16-LABEL: 'fcmp_ugt' +; NOF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ugt <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ugt <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ugt <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ugt <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ugt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ugt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ugt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ugt undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ugt undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_ugt' +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ugt <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ugt <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ugt <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ugt <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ugt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%nxv2f16 = fcmp ugt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ugt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ugt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ugt undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_ugt' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ugt <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ugt <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ugt <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ugt <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ugt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ugt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ugt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ugt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ugt undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp ugt <2 x half> undef, undef + %v4f16 = fcmp ugt <4 x half> undef, undef + %v8f16 = fcmp ugt <8 x half> undef, undef + %v16f16 = fcmp ugt <16 x half> undef, undef + %nxv1f16 = fcmp ugt undef, undef + %nxv2f16 = fcmp ugt undef, undef + %nxv4f16 = fcmp ugt undef, undef + %nxv8f16 = fcmp ugt undef, undef + %nxv16f16 = fcmp ugt undef, undef + ret void +} +define void @fcmp_uge() { +; NOF16-LABEL: 'fcmp_uge' +; NOF16-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: %v2f16 = fcmp uge <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp uge <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp uge <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp uge <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp uge undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp uge undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp uge undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp uge undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp uge undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_uge' +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp uge <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp uge <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp uge <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp uge <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp uge undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp uge undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp uge undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp uge undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp uge undef, undef +; VFH-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_uge' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp uge <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp uge <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp uge <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp uge <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp uge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp uge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp uge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp uge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp uge undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp uge <2 x half> undef, undef + %v4f16 = fcmp uge <4 x half> undef, undef + %v8f16 = fcmp uge <8 x half> undef, undef + %v16f16 = fcmp uge <16 x half> undef, undef + %nxv1f16 = fcmp uge undef, undef + %nxv2f16 = fcmp uge undef, undef + %nxv4f16 = fcmp uge undef, undef + %nxv8f16 = fcmp uge undef, undef + %nxv16f16 = fcmp uge undef, undef + ret void +} +define void @fcmp_true() { +; NOF16-LABEL: 'fcmp_true' +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %v2f16 = fcmp true <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %v4f16 = fcmp true <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %v8f16 = fcmp true <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %v16f16 = fcmp true <16 x half> undef, undef +; 
NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp true undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp true undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp true undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp true undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp true undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_true' +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp true <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp true <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp true <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp true <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp true undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp true undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp true undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp true undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp true undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_true' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp true <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp true <4 x half> undef, undef +; VFHMIN-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp true <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp true <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp true undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp true undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp true undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp true undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp true undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp true <2 x half> undef, undef + %v4f16 = fcmp true <4 x half> undef, undef + %v8f16 = fcmp true <8 x half> undef, undef + %v16f16 = fcmp true <16 x half> undef, undef + %nxv1f16 = fcmp true undef, undef + %nxv2f16 = fcmp true undef, undef + %nxv4f16 = fcmp true undef, undef + %nxv8f16 = fcmp true undef, undef + %nxv16f16 = fcmp true undef, undef + ret void +} +define void @fcmp_false() { +; NOF16-LABEL: 'fcmp_false' +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %v2f16 = fcmp false <2 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %v4f16 = fcmp false <4 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %v8f16 = fcmp false <8 x half> undef, undef +; NOF16-NEXT: Cost Model: Invalid cost for instruction: %v16f16 = fcmp false <16 x half> undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp false undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp false undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp false 
undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp false undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp false undef, undef +; NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFH-LABEL: 'fcmp_false' +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp false <2 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp false <4 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp false <8 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp false <16 x half> undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp false undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp false undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp false undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp false undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp false undef, undef +; VFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VFHMIN-LABEL: 'fcmp_false' +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp false <2 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp false <4 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp false <8 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp false <16 x half> undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%nxv1f16 = fcmp false undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp false undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp false undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp false undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp false undef, undef +; VFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16 = fcmp false <2 x half> undef, undef + %v4f16 = fcmp false <4 x half> undef, undef + %v8f16 = fcmp false <8 x half> undef, undef + %v16f16 = fcmp false <16 x half> undef, undef + %nxv1f16 = fcmp false undef, undef + %nxv2f16 = fcmp false undef, undef + %nxv4f16 = fcmp false undef, undef + %nxv8f16 = fcmp false undef, undef + %nxv16f16 = fcmp false undef, undef + ret void +} From ad192f9f20ad48188b80855c085a0ad7266e0056 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 3 Jan 2025 20:44:43 +1100 Subject: [PATCH 064/480] [ORC] Restrict check-dwarf-filename test to Darwin for now. This test is failing on Windows (see e.g. https://lab.llvm.org/buildbot/#/builders/146/builds/1983), probably due to incomplete debugger support there (the test registers debug info in-process, so non-Darwin builds shouldn't be expected to have the right symbols). 
--- .../ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s index a2eee21a0761d..df44ce996ecad 100644 --- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s @@ -2,7 +2,7 @@ # RUN: llvm-jitlink -debug-only=orc -noexec -debugger-support %t.o 2>&1 | \ # RUN: FileCheck %s # -# REQUIRES: asserts +# REQUIRES: asserts && system-darwin # # Test that source file names can be indentified from DWARF line tables. From cdad18319425a7bf93cc25b276a7961fe5b1168b Mon Sep 17 00:00:00 2001 From: Mariya Podchishchaeva Date: Fri, 3 Jan 2025 11:17:16 +0100 Subject: [PATCH 065/480] [clang] Fix #embed "fast path" (#121479) When a single #embed directive is used to initialize a char array, the case is optimized via swap of EmbedExpr to underlying StringLiteral, resulting in better performance in AST consumers. While browsing through the code, I realized that 7122b70cfc8e23a069410215c363da76d842bda4 which changed type of EmbedExpr made the "fast path" unreachable. This patch fixes this unfortunate situation. 
--- clang/lib/Sema/SemaInit.cpp | 9 ++------- clang/test/Analysis/embed.c | 2 +- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 5909457b04e66..0dd5f468cf60b 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -2030,13 +2030,8 @@ canInitializeArrayWithEmbedDataString(ArrayRef ExprList, if (InitType->isArrayType()) { const ArrayType *InitArrayType = InitType->getAsArrayTypeUnsafe(); - QualType InitElementTy = InitArrayType->getElementType(); - QualType EmbedExprElementTy = EE->getDataStringLiteral()->getType(); - const bool TypesMatch = - Context.typesAreCompatible(InitElementTy, EmbedExprElementTy) || - (InitElementTy->isCharType() && EmbedExprElementTy->isCharType()); - if (TypesMatch) - return true; + StringLiteral *SL = EE->getDataStringLiteral(); + return IsStringInit(SL, InitArrayType, Context) == SIF_None; } return false; } diff --git a/clang/test/Analysis/embed.c b/clang/test/Analysis/embed.c index 32f6c13032574..db8c270fb35de 100644 --- a/clang/test/Analysis/embed.c +++ b/clang/test/Analysis/embed.c @@ -8,5 +8,5 @@ int main() { #embed "embed.c" }; clang_analyzer_dump_ptr(SelfBytes); // expected-warning {{&Element{SelfBytes,0 S64b,unsigned char}}} - clang_analyzer_dump(SelfBytes[0]); // expected-warning {{Unknown}} FIXME: This should be the `/` character. + clang_analyzer_dump(SelfBytes[0]); // expected-warning {{47 U8b}} } From e3ec5a728674fd775bb85a7d159acdb4fa1d69c2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 3 Jan 2025 10:29:07 +0000 Subject: [PATCH 066/480] [VectorCombine] foldShuffleOfBinops - fold shuffle(binop(shuffle(x),shuffle(z)),binop(shuffle(y),shuffle(w)) -> binop(shuffle(x,z),shuffle(y,w)) (#120984) Some patterns (in particular horizontal style patterns) can end up with shuffles straddling both sides of a binop/cmp. 
Where individually the folds aren't worth it, by merging the (oneuse) shuffles we can notably reduce the net instruction count and cost. One of the final steps towards finally addressing #34072 --- .../Transforms/Vectorize/VectorCombine.cpp | 34 +++- .../test/Transforms/PhaseOrdering/X86/hadd.ll | 187 ++++++------------ .../Transforms/PhaseOrdering/X86/pr50392.ll | 9 +- .../X86/extract-binop-inseltpoison.ll | 11 +- 4 files changed, 95 insertions(+), 146 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 493ed95b1d22e..9bca613593591 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1743,6 +1743,36 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I); + // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns + // where one use shuffles have gotten split across the binop/cmp. These + // often allow a major reduction in total cost that wouldn't happen as + // individual folds. + auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef Mask, + TTI::TargetCostKind CostKind) -> bool { + Value *InnerOp; + ArrayRef InnerMask; + if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(), + m_Mask(InnerMask)))) && + InnerOp->getType() == Op->getType() && + all_of(InnerMask, + [NumSrcElts](int M) { return M < (int)NumSrcElts; })) { + for (int &M : Mask) + if (Offset <= M && M < (int)(Offset + NumSrcElts)) { + M = InnerMask[M - Offset]; + M = 0 <= M ? 
M + Offset : M; + } + OldCost += TTI.getInstructionCost(cast(Op), CostKind); + Op = InnerOp; + return true; + } + return false; + }; + bool ReducedInstCount = false; + ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind); + ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind); + ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind); + ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind); + InstructionCost NewCost = TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) + TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}); @@ -1763,8 +1793,8 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { // If either shuffle will constant fold away, then fold for the same cost as // we will reduce the instruction count. - bool ReducedInstCount = (isa(X) && isa(Z)) || - (isa(Y) && isa(W)); + ReducedInstCount |= (isa(X) && isa(Z)) || + (isa(Y) && isa(W)); if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost)) return false; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll index 798824bce4dac..67da29b6cee7d 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll @@ -78,30 +78,16 @@ define <8 x i16> @add_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) { ; SSE2-NEXT: ret <8 x i16> [[RESULT]] ; ; SSE4-LABEL: @add_v8i16_u1234567( -; SSE4-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> -; SSE4-NEXT: [[TMP1:%.*]] = add <8 x i16> [[A]], [[SHIFT]] -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> -; SSE4-NEXT: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] -; SSE4-NEXT: [[HADD32:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP4]], <8 x i32> -; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> 
poison, <8 x i32> -; SSE4-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> +; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> +; SSE4-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> ; SSE4-NEXT: [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]] -; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD32]], <8 x i16> [[TMP7]], <8 x i32> -; SSE4-NEXT: ret <8 x i16> [[RESULT]] +; SSE4-NEXT: ret <8 x i16> [[TMP7]] ; ; AVX-LABEL: @add_v8i16_u1234567( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> -; AVX-NEXT: [[TMP1:%.*]] = add <8 x i16> [[A]], [[SHIFT]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] -; AVX-NEXT: [[HADD32:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP4]], <8 x i32> -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> -; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> ; AVX-NEXT: [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]] -; AVX-NEXT: [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD32]], <8 x i16> [[TMP7]], <8 x i32> -; AVX-NEXT: ret <8 x i16> [[RESULT]] +; AVX-NEXT: ret <8 x i16> [[TMP7]] ; %a0 = extractelement <8 x i16> %a, i32 0 %a1 = extractelement <8 x i16> %a, i32 1 @@ -172,13 +158,10 @@ define <4 x i32> @add_v4i32_0123(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @add_v4i32_u123(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: @add_v4i32_u123( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: 
[[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[RESULT1]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -202,13 +185,10 @@ define <4 x i32> @add_v4i32_u123(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @add_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: @add_v4i32_0u23( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[RESULT1]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -232,40 +212,28 @@ define <4 x i32> @add_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @add_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: @add_v4i32_01u3( -; SSE2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> 
[[A:%.*]], <4 x i32> poison, <4 x i32> -; SSE2-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]] -; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] -; SSE2-NEXT: [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> -; SSE2-NEXT: ret <4 x i32> [[RESULT1]] +; SSE2-NEXT: ret <4 x i32> [[TMP4]] ; ; SSE4-LABEL: @add_v4i32_01u3( -; SSE4-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> -; SSE4-NEXT: [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[B]] -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> ; SSE4-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] -; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> -; SSE4-NEXT: ret <4 x i32> [[RESULT]] +; SSE4-NEXT: ret <4 x i32> [[TMP4]] ; ; AVX2-LABEL: @add_v4i32_01u3( -; AVX2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[B]] -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x 
i32> [[B]], <4 x i32> ; AVX2-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[RESULT:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> -; AVX2-NEXT: ret <4 x i32> [[RESULT]] +; AVX2-NEXT: ret <4 x i32> [[TMP4]] ; ; AVX512-LABEL: @add_v4i32_01u3( -; AVX512-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> -; AVX512-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]] -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> ; AVX512-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] -; AVX512-NEXT: [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> -; AVX512-NEXT: ret <4 x i32> [[RESULT1]] +; AVX512-NEXT: ret <4 x i32> [[TMP4]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -289,13 +257,10 @@ define <4 x i32> @add_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @add_v4i32_012u(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: @add_v4i32_012u( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> 
-; CHECK-NEXT: ret <4 x i32> [[RESULT1]] +; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -420,17 +385,14 @@ define <8 x i32> @add_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @add_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: @add_v8i32_01234u67( -; SSE2-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <8 x i32> -; SSE2-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]] ; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <2 x i32> ; SSE2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> ; SSE2-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP5]], [[TMP6]] -; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B]], <8 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP2]], [[TMP3]] -; SSE2-NEXT: [[HADD4:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP1]], <8 x i32> ; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <8 x i32> -; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP7]], <8 x i32> +; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <8 x i32> ; SSE2-NEXT: ret <8 x i32> [[RESULT]] ; ; SSE4-LABEL: @add_v8i32_01234u67( @@ -449,17 +411,10 @@ define <8 x i32> @add_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) { ; SSE4-NEXT: ret <8 x i32> [[RESULT]] ; ; AVX-LABEL: @add_v8i32_01234u67( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <8 x i32> -; AVX-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> 
[[B:%.*]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP2]], [[TMP3]] -; AVX-NEXT: [[HADD4:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP1]], <8 x i32> -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> -; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> ; AVX-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]] -; AVX-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP7]], <8 x i32> -; AVX-NEXT: ret <8 x i32> [[RESULT]] +; AVX-NEXT: ret <8 x i32> [[TMP7]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -530,13 +485,10 @@ define <4 x float> @add_v4f32_0123(<4 x float> %a, <4 x float> %b) { define <4 x float> @add_v4f32_u123(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @add_v4f32_u123( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[RESULT1]] +; CHECK-NEXT: ret <4 x float> [[TMP4]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 
1 @@ -599,22 +551,16 @@ define <4 x float> @add_v4f32_01u3(<4 x float> %a, <4 x float> %b) { ; SSE2-NEXT: ret <4 x float> [[RESULT1]] ; ; SSE4-LABEL: @add_v4f32_01u3( -; SSE4-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> -; SSE4-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[B]] -; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> -; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> ; SSE4-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]] -; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> -; SSE4-NEXT: ret <4 x float> [[RESULT]] +; SSE4-NEXT: ret <4 x float> [[TMP4]] ; ; AVX2-LABEL: @add_v4f32_01u3( -; AVX2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> -; AVX2-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[B]] -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> ; AVX2-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> -; AVX2-NEXT: ret <4 x float> [[RESULT]] +; AVX2-NEXT: ret <4 x float> [[TMP4]] ; ; AVX512-LABEL: @add_v4f32_01u3( ; AVX512-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> @@ -820,17 +766,10 @@ define <8 x float> @add_v8f32_012u4567(<8 x float> %a, <8 x float> %b) { ; 
SSE-NEXT: ret <8 x float> [[RESULT]] ; ; AVX-LABEL: @add_v8f32_012u4567( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A]], [[SHIFT]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = fadd <8 x float> [[TMP2]], [[TMP3]] -; AVX-NEXT: [[HADD5:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> [[TMP1]], <8 x i32> -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> ; AVX-NEXT: [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]] -; AVX-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP7]], <8 x i32> -; AVX-NEXT: ret <8 x float> [[RESULT]] +; AVX-NEXT: ret <8 x float> [[TMP7]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -983,13 +922,10 @@ define <4 x double> @add_v4f64_u123(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: ret <4 x double> [[RESULT]] ; ; AVX-LABEL: @add_v4f64_u123( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> -; AVX-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> ; AVX-NEXT: 
[[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]] -; AVX-NEXT: [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> -; AVX-NEXT: ret <4 x double> [[RESULT]] +; AVX-NEXT: ret <4 x double> [[TMP4]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -1034,13 +970,10 @@ define <4 x double> @add_v4f64_0u23(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: ret <4 x double> [[RESULT]] ; ; AVX-LABEL: @add_v4f64_0u23( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> -; AVX-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> ; AVX-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]] -; AVX-NEXT: [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> -; AVX-NEXT: ret <4 x double> [[RESULT]] +; AVX-NEXT: ret <4 x double> [[TMP4]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -1085,13 +1018,10 @@ define <4 x double> @add_v4f64_01u3(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: ret <4 x double> [[RESULT]] ; ; AVX-LABEL: @add_v4f64_01u3( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> -; AVX-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: 
[[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> ; AVX-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]] -; AVX-NEXT: [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> -; AVX-NEXT: ret <4 x double> [[RESULT]] +; AVX-NEXT: ret <4 x double> [[TMP4]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -1136,13 +1066,10 @@ define <4 x double> @add_v4f64_012u(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: ret <4 x double> [[RESULT]] ; ; AVX-LABEL: @add_v4f64_012u( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> -; AVX-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[A]], [[SHIFT]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> ; AVX-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]] -; AVX-NEXT: [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> -; AVX-NEXT: ret <4 x double> [[RESULT]] +; AVX-NEXT: ret <4 x double> [[TMP4]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll index 4e1051d1991aa..d92df9741644b 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll @@ -31,13 +31,10 @@ define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: ret <4 x double> [[SHUFFLE]] ; ; AVX-LABEL: @PR50392( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; AVX-NEXT: 
[[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> +; AVX-NEXT: [[B:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B1:%.*]], <4 x i32> +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B1]], <4 x i32> ; AVX-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[B]], [[SHIFT]] -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP4]], <4 x i32> -; AVX-NEXT: ret <4 x double> [[SHUFFLE]] +; AVX-NEXT: ret <4 x double> [[TMP4]] ; %vecext = extractelement <4 x double> %a, i32 0 %vecext1 = extractelement <4 x double> %a, i32 1 diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll index 800f57646a3e1..6ef18e66d4211 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll @@ -468,15 +468,10 @@ define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: @PR34724( ; SSE-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 ; SSE-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; SSE-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] -; SSE-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]] -; SSE-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> +; SSE-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B1:%.*]], <4 x i32> +; SSE-NEXT: [[B:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B1]], <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = fadd <4 x float> 
[[SHIFT2]], [[B]] -; SSE-NEXT: [[V2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> -; SSE-NEXT: [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> -; SSE-NEXT: ret <4 x float> [[V3]] +; SSE-NEXT: ret <4 x float> [[TMP3]] ; ; AVX-LABEL: @PR34724( ; AVX-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 From d3eb65f15dfda454424125b2fa675378bd350889 Mon Sep 17 00:00:00 2001 From: Kaviya Rajendiran <67495422+kaviya2510@users.noreply.github.com> Date: Fri, 3 Jan 2025 16:22:38 +0530 Subject: [PATCH 067/480] [MLIR][OpenMP] Lowering aligned clause to LLVM IR for SIMD directive (#119536) This patch, - Added a translation support for aligned clause in SIMD directive by passing the alignment details to "llvm.assume" intrinsic. - Updated the insertion point for llvm.assume intrinsic call in "OMPIRBuilder.cpp". - Added a check in aligned clause MLIR lowering, to ensure that the alignment value must be a power of 2. --- clang/test/OpenMP/irbuilder_simd_aligned.cpp | 6 +- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 2 + llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 3 +- .../Frontend/OpenMPIRBuilderTest.cpp | 8 +-- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 23 +++++-- .../Target/LLVMIR/openmp-simd-aligned.mlir | 60 +++++++++++++++++++ mlir/test/Target/LLVMIR/openmp-todo.mlir | 12 ---- 7 files changed, 89 insertions(+), 25 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/openmp-simd-aligned.mlir diff --git a/clang/test/OpenMP/irbuilder_simd_aligned.cpp b/clang/test/OpenMP/irbuilder_simd_aligned.cpp index 1c3dc49b717ed..721fde6d95495 100644 --- a/clang/test/OpenMP/irbuilder_simd_aligned.cpp +++ b/clang/test/OpenMP/irbuilder_simd_aligned.cpp @@ -70,8 +70,11 @@ void simple(float *a, float *b, int *c) { // CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] // CHECK: for.end: // CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr 
[[TMP4]], i64 128) ] // CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[P]], align 8 +// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 64) ] // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [32 x i32], ptr [[D]], i64 0, i64 0 +// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[ARRAYDECAY]], i64 16) ] // CHECK-NEXT: store i32 3, ptr [[I1]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0 // CHECK-NEXT: store ptr [[I1]], ptr [[TMP6]], align 8 @@ -82,9 +85,6 @@ void simple(float *a, float *b, int *c) { // CHECK-NEXT: [[DOTCOUNT:%.*]] = load i32, ptr [[DOTCOUNT_ADDR]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_PREHEADER:%.*]] // CHECK: omp_loop.preheader: -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 128) ] -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 64) ] -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[ARRAYDECAY]], i64 16) ] // CHECK-NEXT: br label [[OMP_LOOP_HEADER:%.*]] // CHECK: omp_loop.header: // CHECK-NEXT: [[OMP_LOOP_IV:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER]] ], [ [[OMP_LOOP_NEXT:%.*]], [[OMP_LOOP_INC:%.*]] ] diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index c4ab5e0033d04..fb8e007c7af57 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -613,6 +613,8 @@ addAlignedClause(lower::AbstractConverter &converter, // Do not generate alignment assumption if alignment is less than or equal to // 0. 
if (alignment > 0) { + // alignment value must be power of 2 + assert((alignment & (alignment - 1)) == 0 && "alignment is not power of 2"); auto &objects = std::get(clause.t); if (!objects.empty()) genObjectList(objects, converter, alignedVars); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 0d8dbbe3a8a71..8dbf2aa7e0a24 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -5302,10 +5302,11 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop, Loop *L = LI.getLoopFor(CanonicalLoop->getHeader()); if (AlignedVars.size()) { InsertPointTy IP = Builder.saveIP(); - Builder.SetInsertPoint(CanonicalLoop->getPreheader()->getTerminator()); for (auto &AlignedItem : AlignedVars) { Value *AlignedPtr = AlignedItem.first; Value *Alignment = AlignedItem.second; + Instruction *loadInst = dyn_cast(AlignedPtr); + Builder.SetInsertPoint(loadInst->getNextNode()); Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr, Alignment); } diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index d7ac108249118..9faae88b8dbc7 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -1993,6 +1993,7 @@ TEST_F(OpenMPIRBuilderTest, ApplySimdCustomAligned) { OpenMPIRBuilder OMPBuilder(*M); IRBuilder<> Builder(BB); const int AlignmentValue = 32; + llvm::BasicBlock *sourceBlock = Builder.GetInsertBlock(); AllocaInst *Alloc1 = Builder.CreateAlloca(Builder.getPtrTy(), Builder.getInt64(1)); LoadInst *Load1 = Builder.CreateLoad(Alloc1->getAllocatedType(), Alloc1); @@ -2031,13 +2032,12 @@ TEST_F(OpenMPIRBuilderTest, ApplySimdCustomAligned) { // Check if number of assumption instructions is equal to number of aligned // variables - BasicBlock *LoopPreheader = CLI->getPreheader(); - size_t NumAssummptionCallsInPreheader = count_if( - *LoopPreheader, 
[](Instruction &I) { return isa(I); }); + size_t NumAssummptionCallsInPreheader = + count_if(*sourceBlock, [](Instruction &I) { return isa(I); }); EXPECT_EQ(NumAssummptionCallsInPreheader, AlignedVars.size()); // Check if variables are correctly aligned - for (Instruction &Instr : *LoopPreheader) { + for (Instruction &Instr : *sourceBlock) { if (!isa(Instr)) continue; AssumeInst *AssumeInstruction = cast(&Instr); diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 9a30266103b15..ce129417fc5b2 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -150,10 +150,6 @@ static LogicalResult checkImplementationStatus(Operation &op) { << " operation"; }; - auto checkAligned = [&todo](auto op, LogicalResult &result) { - if (!op.getAlignedVars().empty() || op.getAlignments()) - result = todo("aligned"); - }; auto checkAllocate = [&todo](auto op, LogicalResult &result) { if (!op.getAllocateVars().empty() || !op.getAllocatorVars().empty()) result = todo("allocate"); @@ -275,7 +271,6 @@ static LogicalResult checkImplementationStatus(Operation &op) { }) .Case([&](omp::ParallelOp op) { checkAllocate(op, result); }) .Case([&](omp::SimdOp op) { - checkAligned(op, result); checkLinear(op, result); checkNontemporal(op, result); checkPrivate(op, result); @@ -2302,6 +2297,24 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, llvm::MapVector alignedVars; llvm::omp::OrderKind order = convertOrderKind(simdOp.getOrder()); + llvm::BasicBlock *sourceBlock = builder.GetInsertBlock(); + std::optional alignmentValues = simdOp.getAlignments(); + mlir::OperandRange operands = simdOp.getAlignedVars(); + for (size_t i = 0; i < operands.size(); ++i) { + llvm::Value *alignment = nullptr; + llvm::Value *llvmVal = moduleTranslation.lookupValue(operands[i]); + llvm::Type *ty = 
llvmVal->getType(); + if (auto intAttr = llvm::dyn_cast((*alignmentValues)[i])) { + alignment = builder.getInt64(intAttr.getInt()); + assert(ty->isPointerTy() && "Invalid type for aligned variable"); + assert(alignment && "Invalid alignment value"); + auto curInsert = builder.saveIP(); + builder.SetInsertPoint(sourceBlock->getTerminator()); + llvmVal = builder.CreateLoad(ty, llvmVal); + builder.restoreIP(curInsert); + alignedVars[llvmVal] = alignment; + } + } ompBuilder->applySimd(loopInfo, alignedVars, simdOp.getIfExpr() ? moduleTranslation.lookupValue(simdOp.getIfExpr()) diff --git a/mlir/test/Target/LLVMIR/openmp-simd-aligned.mlir b/mlir/test/Target/LLVMIR/openmp-simd-aligned.mlir new file mode 100644 index 0000000000000..234604e4b664a --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-simd-aligned.mlir @@ -0,0 +1,60 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +//CHECK-LABEL: define void @_QPsimd_aligned_pointer() { +//CHECK: %[[A_PTR:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8 +//CHECK: %[[A_VAL:.*]] = load ptr, ptr %[[A_PTR]], align 8 +//CHECK: call void @llvm.assume(i1 true) [ "align"(ptr %[[A_VAL]], i64 256) ] +llvm.func @_QPsimd_aligned_pointer() { + %1 = llvm.mlir.constant(1 : i64) : i64 + %2 = llvm.alloca %1 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {bindc_name = "x"} : (i64) -> !llvm.ptr + %3 = llvm.alloca %1 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr + %4 = llvm.mlir.constant(1 : i32) : i32 + %5 = llvm.mlir.constant(10 : i32) : i32 + %6 = llvm.mlir.constant(1 : i32) : i32 + omp.simd aligned(%2 : !llvm.ptr -> 256 : i64) { + omp.loop_nest (%arg0) : i32 = (%4) to (%5) inclusive step (%6) { + llvm.store %arg0, %3 : i32, !llvm.ptr + omp.yield + } + } + llvm.return +} + +//CHECK-LABEL: define void @_QPsimd_aligned_cptr() { +//CHECK: %[[A_CPTR:.*]] = alloca %_QM__fortran_builtinsT__builtin_c_ptr, i64 1, align 8 +//CHECK: %[[A_VAL:.*]] = load ptr, ptr %[[A_CPTR]], align 8 +//CHECK: call 
void @llvm.assume(i1 true) [ "align"(ptr %[[A_VAL]], i64 256) ] +llvm.func @_QPsimd_aligned_cptr() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x !llvm.struct<"_QM__fortran_builtinsT__builtin_c_ptr", (i64)> {bindc_name = "a"} : (i64) -> !llvm.ptr + %2 = llvm.mlir.constant(1 : i64) : i64 + %3 = llvm.alloca %2 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr + %4 = llvm.mlir.constant(1 : i32) : i32 + %5 = llvm.mlir.constant(10 : i32) : i32 + %6 = llvm.mlir.constant(1 : i32) : i32 + omp.simd aligned(%1 : !llvm.ptr -> 256 : i64) { + omp.loop_nest (%arg0) : i32 = (%4) to (%5) inclusive step (%6) { + llvm.store %arg0, %3 : i32, !llvm.ptr + omp.yield + } + } + llvm.return +} + +//CHECK-LABEL: define void @_QPsimd_aligned_allocatable() { +//CHECK: %[[A_ADDR:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 +//CHECK: %[[A_VAL:.*]] = load ptr, ptr %[[A_ADDR]], align 8 +//CHECK: call void @llvm.assume(i1 true) [ "align"(ptr %[[A_VAL]], i64 256) ] +llvm.func @_QPsimd_aligned_allocatable() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "a"} : (i64) -> !llvm.ptr + %2 = llvm.mlir.constant(1 : i32) : i32 + %3 = llvm.mlir.constant(10 : i32) : i32 + %4 = llvm.mlir.constant(1 : i32) : i32 + omp.simd aligned(%1 : !llvm.ptr -> 256 : i64) { + omp.loop_nest (%arg0) : i32 = (%2) to (%3) inclusive step (%4) { + omp.yield + } + } + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir index 8f3e466cfbbeb..83a0990d63162 100644 --- a/mlir/test/Target/LLVMIR/openmp-todo.mlir +++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir @@ -127,18 +127,6 @@ llvm.func @sections_private(%x : !llvm.ptr) { llvm.return } -// ----- - -llvm.func @simd_aligned(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) { - // expected-error@below {{not yet implemented: Unhandled clause aligned in omp.simd 
operation}} - // expected-error@below {{LLVM Translation failed for operation: omp.simd}} - omp.simd aligned(%x : !llvm.ptr -> 32) { - omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { - omp.yield - } - } - llvm.return -} // ----- From 2e41489d7b1498ec8a18b99e6d7db9e946f2d786 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Fri, 3 Jan 2025 19:10:43 +0800 Subject: [PATCH 068/480] [Clang] Fix unexpanded packs in NTTP type constraints (#121296) In the case where a type-constraint on an NTTP contains a pack, we form a PackExpansionType to model it. However, there are a few places expecting it to be a non-pack expansion, and luckily only small changes could make them work. Fixes https://github.com/llvm/llvm-project/issues/88866 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/AST/ASTContext.cpp | 2 +- clang/lib/Sema/SemaTemplate.cpp | 16 ++++- clang/lib/Sema/SemaTemplateDeduction.cpp | 5 +- clang/test/SemaCXX/cxx2c-fold-exprs.cpp | 79 ++++++++++++++++++++++++ 5 files changed, 98 insertions(+), 5 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 2789a24ebf273..61d6aa2216cd0 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -886,6 +886,7 @@ Bug Fixes to C++ Support out of a module (which is the case e.g. in MSVC's implementation of ``std`` module). (#GH118218) - Fixed a pack expansion issue in checking unexpanded parameter sizes. (#GH17042) - Fixed a bug where captured structured bindings were modifiable inside non-mutable lambda (#GH95081) +- Clang now identifies unexpanded parameter packs within the type constraint on a non-type template parameter. 
(#GH88866) - Fixed an issue while resolving type of expression indexing into a pack of values of non-dependent type (#GH121242) Bug Fixes to AST Handling diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 8b4ae58e8427a..a9ecb4ee9c76b 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -6376,7 +6376,7 @@ ASTContext::getAutoType(QualType DeducedType, AutoTypeKeyword Keyword, } QualType ASTContext::getUnconstrainedType(QualType T) const { - QualType CanonT = T.getCanonicalType(); + QualType CanonT = T.getNonPackExpansionType().getCanonicalType(); // Remove a type-constraint from a top-level auto or decltype(auto). if (auto *AT = CanonT->getAs()) { diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 5e7a3c8484c88..20ec2fbeaa6a8 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -1228,7 +1228,7 @@ bool Sema::AttachTypeConstraint(AutoTypeLoc TL, NonTypeTemplateParmDecl *NewConstrainedParm, NonTypeTemplateParmDecl *OrigConstrainedParm, SourceLocation EllipsisLoc) { - if (NewConstrainedParm->getType() != TL.getType() || + if (NewConstrainedParm->getType().getNonPackExpansionType() != TL.getType() || TL.getAutoKeyword() != AutoTypeKeyword::Auto) { Diag(NewConstrainedParm->getTypeSourceInfo()->getTypeLoc().getBeginLoc(), diag::err_unsupported_placeholder_constraint) @@ -1530,9 +1530,19 @@ NamedDecl *Sema::ActOnNonTypeTemplateParameter(Scope *S, Declarator &D, Param->setAccess(AS_public); if (AutoTypeLoc TL = TInfo->getTypeLoc().getContainedAutoTypeLoc()) - if (TL.isConstrained()) - if (AttachTypeConstraint(TL, Param, Param, D.getEllipsisLoc())) + if (TL.isConstrained()) { + if (D.getEllipsisLoc().isInvalid() && + T->containsUnexpandedParameterPack()) { + assert(TL.getConceptReference()->getTemplateArgsAsWritten()); + for (auto &Loc : + TL.getConceptReference()->getTemplateArgsAsWritten()->arguments()) + Invalid |= DiagnoseUnexpandedParameterPack( + Loc, 
UnexpandedParameterPackContext::UPPC_TypeConstraint); + } + if (!Invalid && + AttachTypeConstraint(TL, Param, Param, D.getEllipsisLoc())) Invalid = true; + } if (Invalid) Param->setInvalidDecl(); diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index fad20b37a7d9a..1c1f6e30ab7b8 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -857,7 +857,10 @@ class PackDeductionScope { if (auto *NTTP = dyn_cast( TemplateParams->getParam(Index))) { if (!NTTP->isExpandedParameterPack()) - if (auto *Expansion = dyn_cast(NTTP->getType())) + // FIXME: CWG2982 suggests a type-constraint forms a non-deduced + // context, however it is not yet resolved. + if (auto *Expansion = dyn_cast( + S.Context.getUnconstrainedType(NTTP->getType()))) ExtraDeductions.push_back(Expansion->getPattern()); } // FIXME: Also collect the unexpanded packs in any type and template diff --git a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp index 0674135aac483..48061439941f2 100644 --- a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp +++ b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp @@ -305,3 +305,82 @@ static_assert(__is_same_as(_Three_way_comparison_result_with_tuple_like, 0>::type, long)); } + +namespace GH88866 { + +template struct index_by; + +template +concept InitFunc = true; + +namespace ExpandsBoth { + +template auto... init> +struct LazyLitMatrix; // expected-note {{here}} + +template < + typename...Indices, + InitFunc> auto... init +> +struct LazyLitMatrix, init...> { +}; + +// FIXME: Explain why we didn't pick up the partial specialization - pack sizes don't match. +template struct LazyLitMatrix, 42>; +// expected-error@-1 {{instantiation of undefined template}} +template struct LazyLitMatrix, 42, 43>; + +} + +namespace ExpandsRespectively { + +template auto... init> +struct LazyLitMatrix; + +template < + typename...Indices, + InitFunc> auto... 
init +> +struct LazyLitMatrix, init...> { +}; + +template struct LazyLitMatrix, 42>; +template struct LazyLitMatrix, 42, 43>; + +} + +namespace TypeParameter { + +template ... init> +struct LazyLitMatrix; // expected-note {{here}} + +template < + typename...Indices, + InitFunc>... init +> +struct LazyLitMatrix, init...> { +}; + +// FIXME: Explain why we didn't pick up the partial specialization - pack sizes don't match. +template struct LazyLitMatrix, float>; +// expected-error@-1 {{instantiation of undefined template}} +template struct LazyLitMatrix, unsigned, float>; + +} + +namespace Invalid { + +template ... init> +struct LazyLitMatrix; + +template < + typename...Indices, + InitFunc> init + // expected-error@-1 {{unexpanded parameter pack 'Indices'}} +> +struct LazyLitMatrix, init> { +}; + +} + +} From 85849917f7ba19f6906f64726dc5e7101f8984ce Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Fri, 3 Jan 2025 11:16:34 +0000 Subject: [PATCH 069/480] [compiler-rt][rtsan] Reland "fopencookie support." 
(#120864) (#121547) --- .../lib/rtsan/rtsan_interceptors_posix.cpp | 12 +++++++++ .../tests/rtsan_test_interceptors_posix.cpp | 25 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 4e51f464b5730..9f89ab6bf1fc7 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -297,6 +297,17 @@ INTERCEPTOR(FILE *, fdopen, int fd, const char *mode) { return REAL(fdopen)(fd, mode); } +#if SANITIZER_INTERCEPT_FOPENCOOKIE +INTERCEPTOR(FILE *, fopencookie, void *cookie, const char *mode, + cookie_io_functions_t funcs) { + __rtsan_notify_intercepted_call("fopencookie"); + return REAL(fopencookie)(cookie, mode, funcs); +} +#define RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE INTERCEPT_FUNCTION(fopencookie) +#else +#define RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE +#endif + #if SANITIZER_INTERCEPT_OPEN_MEMSTREAM INTERCEPTOR(FILE *, open_memstream, char **buf, size_t *size) { __rtsan_notify_intercepted_call("open_memstream"); @@ -972,6 +983,7 @@ void __rtsan::InitializeInterceptors() { INTERCEPT_FUNCTION(fputs); INTERCEPT_FUNCTION(fdopen); INTERCEPT_FUNCTION(freopen); + RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE; RTSAN_MAYBE_INTERCEPT_OPEN_MEMSTREAM; RTSAN_MAYBE_INTERCEPT_FMEMOPEN; INTERCEPT_FUNCTION(lseek); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index b052dd859dcdf..5adbf0fb63de8 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -353,6 +353,31 @@ TEST_F(RtsanFileTest, FopenDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } +#if SANITIZER_INTERCEPT_FOPENCOOKIE +TEST_F(RtsanFileTest, FopenCookieDieWhenRealtime) { + FILE *f = fopen(GetTemporaryFilePath(), "w"); + EXPECT_THAT(f, Ne(nullptr)); + struct fholder { + FILE *fp; + 
size_t read; + } fh = {f, 0}; + auto CookieRead = [this](void *cookie, char *buf, size_t size) { + fholder *p = reinterpret_cast(cookie); + p->read = fread(static_cast(buf), 1, size, p->fp); + EXPECT_NE(0, p->read); + }; + cookie_io_functions_t funcs = {(cookie_read_function_t *)&CookieRead, nullptr, + nullptr, nullptr}; + auto Func = [&fh, &funcs]() { + FILE *f = fopencookie(&fh, "w", funcs); + EXPECT_THAT(f, Ne(nullptr)); + }; + + ExpectRealtimeDeath(Func, "fopencookie"); + ExpectNonRealtimeSurvival(Func); +} +#endif + #if SANITIZER_INTERCEPT_OPEN_MEMSTREAM TEST_F(RtsanFileTest, OpenMemstreamDiesWhenRealtime) { char *buffer; From 579ced4f8266b273d15b2801067a828151a222ef Mon Sep 17 00:00:00 2001 From: Hugo Trachino Date: Fri, 3 Jan 2025 11:21:59 +0000 Subject: [PATCH 070/480] [MLIR][Python] Add structured.fuseop to python interpreter (#120601) Implements a python interface for structured.fuseOp allowing more freedom with inputs. --- .../mlir/dialects/transform/structured.py | 71 +++++++++++++++++++ .../dialects/transform_structured_ext.py | 36 ++++++++++ 2 files changed, 107 insertions(+) diff --git a/mlir/python/mlir/dialects/transform/structured.py b/mlir/python/mlir/dialects/transform/structured.py index 9121aa8e40237..bf40cc532065d 100644 --- a/mlir/python/mlir/dialects/transform/structured.py +++ b/mlir/python/mlir/dialects/transform/structured.py @@ -140,6 +140,77 @@ def __init__( ) +@_ods_cext.register_operation(_Dialect, replace=True) +class FuseOp(FuseOp): + """Specialization for FuseOp class.""" + + @overload + def __init__( + self, + loop_types: Union[Type, Sequence[Type]], + target: Union[Operation, Value, OpView], + *, + tile_sizes: Optional[Union[DynamicIndexList, ArrayAttr]] = None, + tile_interchange: OptionalIntList = None, + apply_cleanup: Optional[bool] = False, + loc=None, + ip=None, + ): + ... 
+ + @overload + def __init__( + self, + target: Union[Operation, Value, OpView], + *, + tile_sizes: Optional[Union[DynamicIndexList, ArrayAttr]] = None, + tile_interchange: OptionalIntList = None, + apply_cleanup: Optional[bool] = False, + loc=None, + ip=None, + ): + ... + + def __init__( + self, + loop_types_or_target: Union[Type, Sequence[Type], Operation, OpView, Value], + target_or_none: Optional[Union[Operation, Value, OpView]] = None, + *, + tile_sizes: Optional[Union[DynamicIndexList, ArrayAttr]] = None, + tile_interchange: OptionalIntList = None, + apply_cleanup: Optional[bool] = False, + loc=None, + ip=None, + ): + tile_sizes = tile_sizes if tile_sizes else [] + tile_interchange = tile_interchange if tile_interchange else [] + _, tile_sizes, _ = _dispatch_dynamic_index_list(tile_sizes) + _, tile_interchange, _ = _dispatch_dynamic_index_list(tile_interchange) + num_loops = sum(0 if v == 0 else 1 for v in tile_sizes) + + if isinstance(loop_types_or_target, (Operation, Value, OpView)): + loop_types = [transform.AnyOpType.get()] * num_loops + target = loop_types_or_target + assert target_or_none is None, "Cannot construct FuseOp with two targets." 
+ else: + loop_types = ( + ([loop_types_or_target] * num_loops) + if isinstance(loop_types_or_target, Type) + else loop_types_or_target + ) + target = target_or_none + super().__init__( + target.type, + loop_types, + target, + tile_sizes=tile_sizes, + tile_interchange=tile_interchange, + apply_cleanup=apply_cleanup, + loc=loc, + ip=ip, + ) + + @_ods_cext.register_operation(_Dialect, replace=True) class GeneralizeOp(GeneralizeOp): """Specialization for GeneralizeOp class.""" diff --git a/mlir/test/python/dialects/transform_structured_ext.py b/mlir/test/python/dialects/transform_structured_ext.py index fb4c75b533792..8785d6d360074 100644 --- a/mlir/test/python/dialects/transform_structured_ext.py +++ b/mlir/test/python/dialects/transform_structured_ext.py @@ -101,6 +101,42 @@ def testFuseIntoContainingOpCompact(target): # CHECK-SAME: (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) +@run +@create_sequence +def testFuseOpCompact(target): + structured.FuseOp( + target, tile_sizes=[4, 8], tile_interchange=[0, 1], apply_cleanup=True + ) + # CHECK-LABEL: TEST: testFuseOpCompact + # CHECK: transform.sequence + # CHECK: %{{.+}}, %{{.+}}:2 = transform.structured.fuse %{{.*}}[4, 8] + # CHECK-SAME: interchange [0, 1] apply_cleanup = true + # CHECK-SAME: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + + +@run +@create_sequence +def testFuseOpNoArg(target): + structured.FuseOp(target) + # CHECK-LABEL: TEST: testFuseOpNoArg + # CHECK: transform.sequence + # CHECK: %{{.+}} = transform.structured.fuse %{{.*}} : + # CHECK-SAME: (!transform.any_op) -> !transform.any_op + + +@run +@create_sequence +def testFuseOpAttributes(target): + attr = DenseI64ArrayAttr.get([4, 8]) + ichange = DenseI64ArrayAttr.get([0, 1]) + structured.FuseOp(target, tile_sizes=attr, tile_interchange=ichange) + # CHECK-LABEL: TEST: testFuseOpAttributes + # CHECK: transform.sequence + # CHECK: %{{.+}}, %{{.+}}:2 = transform.structured.fuse 
%{{.*}}[4, 8] + # CHECK-SAME: interchange [0, 1] + # CHECK-SAME: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + + @run @create_sequence def testGeneralize(target): From f87a9db8322643ccbc324e317a75b55903129b55 Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 3 Jan 2025 11:28:31 +0000 Subject: [PATCH 071/480] [ARM] Expand fp64 bf16 converts similarly to f32 This helps with +fp64 targets where the f64s are legal and not previously lowered. It can treat fpextends as a shift + cvt and fptrunc can use a libcall. --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 2 + llvm/test/CodeGen/Thumb2/bf16-instructions.ll | 65 +++++++++++-------- 2 files changed, 41 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 5ec2d8389c18e..2e517c21fc4a8 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -806,7 +806,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::bf16, Custom); } else { setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand); + setOperationAction(ISD::BF16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_BF16, MVT::f32, Custom); + setOperationAction(ISD::FP_TO_BF16, MVT::f64, Custom); } for (MVT VT : MVT::fixedlen_vector_valuetypes()) { diff --git a/llvm/test/CodeGen/Thumb2/bf16-instructions.ll b/llvm/test/CodeGen/Thumb2/bf16-instructions.ll index 5de7afca25b84..786e35517fd7c 100644 --- a/llvm/test/CodeGen/Thumb2/bf16-instructions.ll +++ b/llvm/test/CodeGen/Thumb2/bf16-instructions.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple thumbv8.1m.main-none-eabi | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP -; RUN: llc < %s -mtriple thumbv8.1m.main-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP +; RUN: llc < %s -mtriple 
thumbv8.1m.main-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP,CHECK-FPNO64 +; RUN: llc < %s -mtriple thumbv8.1m.main-none-eabihf -mattr=+fullfp16,+fp64 | FileCheck %s --check-prefixes=CHECK,CHECK-FP,CHECK-FP64 define bfloat @test_fadd(bfloat %a, bfloat %b) { ; CHECK-NOFP-LABEL: test_fadd: @@ -259,9 +260,8 @@ define void @test_truncstore64(double %a, ptr %b) { ; CHECK-FP-NEXT: .save {r4, lr} ; CHECK-FP-NEXT: push {r4, lr} ; CHECK-FP-NEXT: mov r4, r0 -; CHECK-FP-NEXT: vmov r0, r1, d0 -; CHECK-FP-NEXT: bl __aeabi_d2f -; CHECK-FP-NEXT: lsrs r0, r0, #16 +; CHECK-FP-NEXT: bl __truncdfbf2 +; CHECK-FP-NEXT: vmov r0, s0 ; CHECK-FP-NEXT: strh r0, [r4] ; CHECK-FP-NEXT: pop {r4, pc} %r = fptrunc double %a to bfloat @@ -312,15 +312,23 @@ define double @test_loadext64(ptr %a) { ; CHECK-NOFP-NEXT: bl __aeabi_f2d ; CHECK-NOFP-NEXT: pop {r7, pc} ; -; CHECK-FP-LABEL: test_loadext64: -; CHECK-FP: @ %bb.0: -; CHECK-FP-NEXT: .save {r7, lr} -; CHECK-FP-NEXT: push {r7, lr} -; CHECK-FP-NEXT: ldrh r0, [r0] -; CHECK-FP-NEXT: lsls r0, r0, #16 -; CHECK-FP-NEXT: bl __aeabi_f2d -; CHECK-FP-NEXT: vmov d0, r0, r1 -; CHECK-FP-NEXT: pop {r7, pc} +; CHECK-FPNO64-LABEL: test_loadext64: +; CHECK-FPNO64: @ %bb.0: +; CHECK-FPNO64-NEXT: .save {r7, lr} +; CHECK-FPNO64-NEXT: push {r7, lr} +; CHECK-FPNO64-NEXT: ldrh r0, [r0] +; CHECK-FPNO64-NEXT: lsls r0, r0, #16 +; CHECK-FPNO64-NEXT: bl __aeabi_f2d +; CHECK-FPNO64-NEXT: vmov d0, r0, r1 +; CHECK-FPNO64-NEXT: pop {r7, pc} +; +; CHECK-FP64-LABEL: test_loadext64: +; CHECK-FP64: @ %bb.0: +; CHECK-FP64-NEXT: ldrh r0, [r0] +; CHECK-FP64-NEXT: lsls r0, r0, #16 +; CHECK-FP64-NEXT: vmov s0, r0 +; CHECK-FP64-NEXT: vcvt.f64.f32 d0, s0 +; CHECK-FP64-NEXT: bx lr %r = load bfloat, ptr %a %d = fpext bfloat %r to double ret double %d @@ -1374,10 +1382,7 @@ define bfloat @test_fptrunc_double(double %a) { ; CHECK-FP: @ %bb.0: ; CHECK-FP-NEXT: .save {r7, lr} ; CHECK-FP-NEXT: push {r7, lr} -; CHECK-FP-NEXT: vmov r0, r1, d0 -; CHECK-FP-NEXT: bl 
__aeabi_d2f -; CHECK-FP-NEXT: lsrs r0, r0, #16 -; CHECK-FP-NEXT: vmov.f16 s0, r0 +; CHECK-FP-NEXT: bl __truncdfbf2 ; CHECK-FP-NEXT: vmov.f16 r0, s0 ; CHECK-FP-NEXT: vmov s0, r0 ; CHECK-FP-NEXT: pop {r7, pc} @@ -1410,15 +1415,23 @@ define double @test_fpext_double(bfloat %a) { ; CHECK-NOFP-NEXT: bl __aeabi_f2d ; CHECK-NOFP-NEXT: pop {r7, pc} ; -; CHECK-FP-LABEL: test_fpext_double: -; CHECK-FP: @ %bb.0: -; CHECK-FP-NEXT: .save {r7, lr} -; CHECK-FP-NEXT: push {r7, lr} -; CHECK-FP-NEXT: vmov r0, s0 -; CHECK-FP-NEXT: lsls r0, r0, #16 -; CHECK-FP-NEXT: bl __aeabi_f2d -; CHECK-FP-NEXT: vmov d0, r0, r1 -; CHECK-FP-NEXT: pop {r7, pc} +; CHECK-FPNO64-LABEL: test_fpext_double: +; CHECK-FPNO64: @ %bb.0: +; CHECK-FPNO64-NEXT: .save {r7, lr} +; CHECK-FPNO64-NEXT: push {r7, lr} +; CHECK-FPNO64-NEXT: vmov r0, s0 +; CHECK-FPNO64-NEXT: lsls r0, r0, #16 +; CHECK-FPNO64-NEXT: bl __aeabi_f2d +; CHECK-FPNO64-NEXT: vmov d0, r0, r1 +; CHECK-FPNO64-NEXT: pop {r7, pc} +; +; CHECK-FP64-LABEL: test_fpext_double: +; CHECK-FP64: @ %bb.0: +; CHECK-FP64-NEXT: vmov r0, s0 +; CHECK-FP64-NEXT: lsls r0, r0, #16 +; CHECK-FP64-NEXT: vmov s0, r0 +; CHECK-FP64-NEXT: vcvt.f64.f32 d0, s0 +; CHECK-FP64-NEXT: bx lr %r = fpext bfloat %a to double ret double %r } From 5cf138cfbaa8040100fed1d0d5e0a189759b24ab Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 3 Jan 2025 12:43:31 +0000 Subject: [PATCH 072/480] [llvm][JITLink][LoongArch] Fix bit extraction on 32 bit platforms This shifted `1UL` to make the mask. On 32 bit Linux UL is 32 bit, so if Hi+1 was >= 32 then you'd get the wrong result here. The other version of this uses 1ULL, but using the uint64_t typename here saves someone going to check what ULL means on different platforms. This fixes test failures seen on Linaro's 32 bit bots: https://lab.llvm.org/buildbot/#/builders/39/builds/3700 https://lab.llvm.org/buildbot/#/builders/122/builds/781 Though I cannot say exactly why this fixes them. 
Does not seem like the new code was triggering this problem, but somehow it must be. --- llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h b/llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h index d31c749bad1b1..1db4b82218109 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h @@ -233,7 +233,7 @@ const char *getEdgeKindName(Edge::Kind K); // Returns extract bits Val[Hi:Lo]. inline uint32_t extractBits(uint64_t Val, unsigned Hi, unsigned Lo) { - return Hi == 63 ? Val >> Lo : (Val & (((1UL << (Hi + 1)) - 1))) >> Lo; + return Hi == 63 ? Val >> Lo : (Val & ((((uint64_t)1 << (Hi + 1)) - 1))) >> Lo; } /// Apply fixup expression for edge to block content. From cba9c6ac15b462e68cf76d496ba0f832a531db8b Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Fri, 3 Jan 2025 12:48:49 +0000 Subject: [PATCH 073/480] [mlir] Fix typo in parameter name annotation comment. Found by ClangTidyBugProne check. 
--- mlir/lib/Transforms/Utils/DialectConversion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 255b0ba2559ee..2b006430d3817 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -1214,7 +1214,7 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( ValueRange targetMat = buildUnresolvedMaterialization( MaterializationKind::Target, computeInsertPoint(repl), operandLoc, /*valueToMap=*/Value(), /*inputs=*/unpacked, - /*outputType=*/legalTypes, /*originalType=*/origType, + /*outputTypes=*/legalTypes, /*originalType=*/origType, currentTypeConverter); remapped.push_back(targetMat); continue; From b1c195cbd16adbc4dac8f4bc01b8a34e315d3e61 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 3 Jan 2025 07:59:54 -0500 Subject: [PATCH 074/480] [gn] port 27f30029741e (yet another tblgen reorg) --- .../gn/secondary/llvm/utils/TableGen/BUILD.gn | 21 +++---------------- .../llvm/utils/TableGen/Basic/BUILD.gn | 9 +++++++- .../llvm/utils/TableGen/Common/BUILD.gn | 2 +- 3 files changed, 12 insertions(+), 20 deletions(-) diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn index ba52a97f39d85..e2daa1e9b73c2 100644 --- a/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn @@ -1,29 +1,13 @@ -source_set("llvm-min-tblgen-sources") { - sources = [ - "ARMTargetDefEmitter.cpp", - "Attributes.cpp", - "DirectiveEmitter.cpp", - "IntrinsicEmitter.cpp", - "RISCVTargetDefEmitter.cpp", - "TableGen.cpp", - "VTEmitter.cpp", - ] - deps = [ - "Basic", - "//llvm/lib/Support", - ] -} - executable("llvm-min-tblgen") { + sources = [ "llvm-min-tblgen.cpp" ] deps = [ - ":llvm-min-tblgen-sources", "Basic", + "//llvm/lib/Support", ] } executable("llvm-tblgen") { deps = [ - 
":llvm-min-tblgen-sources", "Basic", "Common", "//llvm/include/llvm/Config:llvm-config", @@ -55,6 +39,7 @@ executable("llvm-tblgen") { "GlobalISelEmitter.cpp", "InstrDocsEmitter.cpp", "InstrInfoEmitter.cpp", + "llvm-tblgen.cpp", "MacroFusionPredicatorEmitter.cpp", "OptionParserEmitter.cpp", "OptionRSTEmitter.cpp", diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn index 2ebe393fa0fd9..ef6d6e44b6f8d 100644 --- a/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn @@ -1,10 +1,17 @@ -static_library("Basic") { +source_set("Basic") { deps = [ "//llvm/lib/Support", "//llvm/lib/TableGen", ] sources = [ + "ARMTargetDefEmitter.cpp", + "Attributes.cpp", "CodeGenIntrinsics.cpp", + "DirectiveEmitter.cpp", + "IntrinsicEmitter.cpp", + "RISCVTargetDefEmitter.cpp", "SDNodeProperties.cpp", + "TableGen.cpp", + "VTEmitter.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn index c46e7cb1dc8b7..31d0e1dade039 100644 --- a/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn @@ -1,4 +1,4 @@ -static_library("Common") { +source_set("Common") { deps = [ "//llvm/include/llvm/CodeGen:GenVT", "//llvm/lib/CodeGenTypes", From d598829375634da42910e2624f181f6b843bdc8b Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 3 Jan 2025 08:05:10 -0500 Subject: [PATCH 075/480] [gn] make LLVMTableGenCommon a static_library again Else TableGenTests doesn't link. 
--- llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn index 31d0e1dade039..db11e56e550f9 100644 --- a/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn @@ -1,4 +1,5 @@ -source_set("Common") { +static_library("Common") { + output_name = "LLVMTableGenCommon" deps = [ "//llvm/include/llvm/CodeGen:GenVT", "//llvm/lib/CodeGenTypes", From e53494c750246118c313b3cbf7479edb682f2208 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Fri, 3 Jan 2025 13:07:49 +0000 Subject: [PATCH 076/480] [mlir] Fix 0 values passed to the wrong parameters. This was found by modernize-use-nullptr ClangTidy check, which suggested to pass nullptr instead of 0 to DIFileAttr. However it looks like the intention was to pass the two 0 values for line and scopeLine, and we should pass {} to DIFileAttr. Do that change. 
--- mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp index 28e8b81a05576..7490e8735f5fd 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp @@ -233,7 +233,7 @@ DIRecursiveTypeAttrInterface DISubprogramAttr::withRecId(DistinctAttr recId) { DIRecursiveTypeAttrInterface DISubprogramAttr::getRecSelf(DistinctAttr recId) { return DISubprogramAttr::get(recId.getContext(), recId, /*isRecSelf=*/true, - {}, {}, {}, {}, {}, 0, 0, {}, {}, {}, {}, {}); + {}, {}, {}, {}, {}, {}, 0, 0, {}, {}, {}, {}); } //===----------------------------------------------------------------------===// From 119fc720a19e047fee59d7f7446c911b158563e0 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Fri, 3 Jan 2025 14:14:00 +0100 Subject: [PATCH 077/480] NFC, explicitly specify the -fopenmp lib in spirv-openmp-toolchain.c test Don't rely on the default `CLANG_DEFAULT_OPENMP_RUNTIME` env variable which is `libomp` by default. 
--- clang/test/Driver/spirv-openmp-toolchain.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/clang/test/Driver/spirv-openmp-toolchain.c b/clang/test/Driver/spirv-openmp-toolchain.c index 3eb1f22a03ed0..377b2d9be0b09 100644 --- a/clang/test/Driver/spirv-openmp-toolchain.c +++ b/clang/test/Driver/spirv-openmp-toolchain.c @@ -9,7 +9,7 @@ // CHECK: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-emit-obj" // CHECK: clang-linker-wrapper{{.*}} "-o" "a.out" -// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=spirv64-intel %s 2>&1 \ +// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=spirv64-intel %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-PHASES %s // CHECK-PHASES: 0: input, "[[INPUT:.+]]", c, (host-openmp) @@ -28,8 +28,8 @@ // CHECK-PHASES: 13: assembler, {12}, object, (host-openmp) // CHECK-PHASES: 14: clang-linker-wrapper, {13}, image, (host-openmp) -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp=libomp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp=libomp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS // CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_BC:.+]]" // CHECK-BINDINGS: "spirv64-intel" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[DEVICE_TEMP_BC:.+]]" @@ -38,8 +38,8 @@ // CHECK-BINDINGS: 
"x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_IMAGE]]"], output: "[[HOST_OBJ:.+]]" // CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out" -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS-TEMPS -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=spirv64-intel %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS-TEMPS +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp=libomp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS-TEMPS +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp=libomp -fopenmp-targets=spirv64-intel %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS-TEMPS // CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_PP:.+]]" // CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_PP]]"], output: "[[HOST_BC:.+]]" // CHECK-BINDINGS-TEMPS: "spirv64-intel" - "clang", inputs: ["[[INPUT]]"], output: "[[DEVICE_PP:.+]]" @@ -51,14 +51,14 @@ // CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang::as", inputs: ["[[HOST_ASM]]"], output: "[[HOST_OBJ:.+]]" // CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out" -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp=libomp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR // CHECK-EMIT-LLVM-IR: "-cc1" "-triple" 
"spirv64-intel"{{.*}}"-emit-llvm-bc" -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=spirv64-intel \ +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=spirv64-intel \ // RUN: --sysroot=%S/Inputs/spirv-openmp/ %s 2>&1 | FileCheck --check-prefix=CHECK-GPULIB %s // CHECK-GPULIB: "-cc1" "-triple" "spirv64-intel"{{.*}}"-mlink-builtin-bitcode" "{{.*}}libomptarget-spirv64.bc" -// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp --offload-arch=spirv64-intel \ +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=spirv64-intel \ // RUN: --libomptarget-spirv-bc-path=%t/ -nogpulib %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-OFFLOAD-ARCH-ERROR // CHECK-OFFLOAD-ARCH-ERROR: error: failed to deduce triple for target architecture 'spirv64-intel'; specify the triple using '-fopenmp-targets' and '-Xopenmp-target' instead From 6cd171dc3330a055a8d8a1ddff63631d42150b8a Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 3 Jan 2025 08:20:06 -0500 Subject: [PATCH 078/480] [lld/COFF] Support thin archives in /reproduce: files (#121512) This already worked without /wholearchive; now it works with it too. (Only for thin archives containing relative file names, matching the ELF and Mach-O ports.) --- lld/COFF/InputFiles.cpp | 8 ++++++++ lld/test/COFF/linkrepro-thin-archives.s | 23 +++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 lld/test/COFF/linkrepro-thin-archives.s diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index e698f66b84f62..a94c984cfd487 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -149,11 +149,19 @@ std::vector lld::coff::getArchiveMembers(COFFLinkerContext &ctx, Archive *file) { std::vector v; Error err = Error::success(); + + // Thin archives refer to .o files, so --reproduces needs the .o files too. 
+ bool addToTar = file->isThin() && ctx.driver.tar; + for (const Archive::Child &c : file->children(err)) { MemoryBufferRef mbref = CHECK(c.getMemoryBufferRef(), file->getFileName() + ": could not get the buffer for a child of the archive"); + if (addToTar) { + ctx.driver.tar->append(relativeToRoot(check(c.getFullName())), + mbref.getBuffer()); + } v.push_back(mbref); } if (err) diff --git a/lld/test/COFF/linkrepro-thin-archives.s b/lld/test/COFF/linkrepro-thin-archives.s new file mode 100644 index 0000000000000..6fde36b84e0af --- /dev/null +++ b/lld/test/COFF/linkrepro-thin-archives.s @@ -0,0 +1,23 @@ +# REQUIRES: x86 + +# RUN: rm -rf %t.dir; split-file %s %t.dir + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-windows %t.dir/foo.s -o %t.dir/foo.obj +# RUN: cd %t.dir +# RUN: llvm-ar rcsT foo.lib foo.obj + +# RUN: lld-link foo.lib /out:/dev/null /reproduce:repro.tar \ +# RUN: /subsystem:console /machine:x64 +# RUN: tar tf repro.tar | FileCheck -DPATH='repro/%:t.dir' %s + +# RUN: lld-link /wholearchive foo.lib /out:/dev/null /reproduce:repro2.tar \ +# RUN: /subsystem:console /machine:x64 +# RUN: tar tf repro2.tar | FileCheck -DPATH='repro2/%:t.dir' %s + +# CHECK-DAG: [[PATH]]/foo.lib +# CHECK-DAG: [[PATH]]/foo.obj + +#--- foo.s +.globl mainCRTStartup +mainCRTStartup: + nop From e576c5bed79f8a9528391756c8475cc3a6276adf Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Fri, 3 Jan 2025 15:13:48 +0100 Subject: [PATCH 079/480] Fix an incorrect -show-graph command-line flag in COFF_comdat_weak_plus_strong.s test The flag -show-graph has been renamed to -show-graphs in 01bdd8cffcaf97636b5fb6ee4933e62c872528d3 --- .../JITLink/x86-64/COFF_comdat_weak_plus_strong.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak_plus_strong.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak_plus_strong.s index 2754855e428e0..01aac02f5286e 100644 --- 
a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak_plus_strong.s +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak_plus_strong.s @@ -8,7 +8,7 @@ # # RUN: not llvm-jitlink -noexec %t/COFF_main.o %t/COFF_weak_1.o %t/COFF_strong.o \ # RUN: -slab-allocate 64Kb -slab-address 0xfff00000 \ -# RUN: -slab-page-size 4096 -show-graph 2>&1 | FileCheck %s +# RUN: -slab-page-size 4096 -show-graphs=".*" 2>&1 | FileCheck %s # # Check that a combination of comdat any definition and strong definition # generate duplicate definition error. From df859f90aab261918eee26382021e8455b532f7d Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 3 Jan 2025 08:36:34 -0600 Subject: [PATCH 080/480] [flang][OpenMP] Frontend support for NOTHING directive (#120606) Create OpenMPUtilityConstruct and put the two utility directives in it (error and nothing). Rename OpenMPErrorConstruct to OmpErrorDirective. --- .../FlangOmpReport/FlangOmpReportVisitor.cpp | 8 +++-- flang/include/flang/Parser/dump-parse-tree.h | 4 ++- flang/include/flang/Parser/parse-tree.h | 34 ++++++++++++++----- flang/lib/Lower/OpenMP/OpenMP.cpp | 4 +-- flang/lib/Parser/openmp-parsers.cpp | 18 +++++++--- flang/lib/Parser/unparse.cpp | 6 +++- flang/lib/Semantics/check-omp-structure.cpp | 4 +-- flang/lib/Semantics/check-omp-structure.h | 4 +-- flang/test/Lower/OpenMP/Todo/error.f90 | 2 +- flang/test/Parser/OpenMP/error-unparse.f90 | 6 ++-- flang/test/Parser/OpenMP/nothing.f90 | 13 +++++++ 11 files changed, 75 insertions(+), 28 deletions(-) create mode 100644 flang/test/Parser/OpenMP/nothing.f90 diff --git a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp index 665b92be00898..231df63bbae92 100644 --- a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp +++ b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp @@ -90,6 +90,10 @@ SourcePosition OpenMPCounterVisitor::getLocation(const OpenMPConstruct &c) { const CharBlock 
&source{c.source}; return (parsing->allCooked().GetSourcePositionRange(source))->first; }, + [&](const OpenMPUtilityConstruct &c) -> SourcePosition { + const CharBlock &source{c.source}; + return (parsing->allCooked().GetSourcePositionRange(source))->first; + }, }, c.u); } @@ -143,8 +147,8 @@ std::string OpenMPCounterVisitor::getName(const OpenMPConstruct &c) { }, c.u); }, - [&](const OpenMPErrorConstruct &c) -> std::string { - const CharBlock &source{std::get<0>(c.t).source}; + [&](const OpenMPUtilityConstruct &c) -> std::string { + const CharBlock &source{c.source}; return normalize_construct_name(source.ToString()); }, [&](const OpenMPSectionConstruct &c) -> std::string { diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index 7821d40a644a2..fa813727442f0 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -516,6 +516,8 @@ class ParseTreeDumper { #include "llvm/Frontend/OpenMP/OMP.inc" NODE(parser, OmpClauseList) NODE(parser, OmpCriticalDirective) + NODE(parser, OmpErrorDirective) + NODE(parser, OmpNothingDirective) NODE(parser, OmpDeclareTargetSpecifier) NODE(parser, OmpDeclareTargetWithClause) NODE(parser, OmpDeclareTargetWithList) @@ -662,7 +664,7 @@ class ParseTreeDumper { NODE(parser, OmpAtomicDefaultMemOrderClause) NODE_ENUM(common, OmpAtomicDefaultMemOrderType) NODE(parser, OpenMPDepobjConstruct) - NODE(parser, OpenMPErrorConstruct) + NODE(parser, OpenMPUtilityConstruct) NODE(parser, OpenMPFlushConstruct) NODE(parser, OpenMPLoopConstruct) NODE(parser, OpenMPExecutableAllocate) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 2ef593b3e50da..9df7c6d5e39c3 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -4182,6 +4182,30 @@ struct OmpClauseList { // --- Directives and constructs +// Ref: [5.1:89-90], [5.2:216] +// +// nothing-directive -> +// 
NOTHING // since 5.1 +struct OmpNothingDirective { + using EmptyTrait = std::true_type; + COPY_AND_ASSIGN_BOILERPLATE(OmpNothingDirective); + CharBlock source; +}; + +// Ref: OpenMP [5.2:216-218] +// ERROR AT(compilation|execution) SEVERITY(fatal|warning) MESSAGE("msg-str) +struct OmpErrorDirective { + TUPLE_CLASS_BOILERPLATE(OmpErrorDirective); + CharBlock source; + std::tuple t; +}; + +struct OpenMPUtilityConstruct { + UNION_CLASS_BOILERPLATE(OpenMPUtilityConstruct); + CharBlock source; + std::variant u; +}; + // 2.7.2 SECTIONS // 2.11.2 PARALLEL SECTIONS struct OmpSectionsDirective { @@ -4506,14 +4530,6 @@ struct OpenMPDepobjConstruct { std::tuple t; }; -// Ref: OpenMP [5.2:216-218] -// ERROR AT(compilation|execution) SEVERITY(fatal|warning) MESSAGE("msg-str) -struct OpenMPErrorConstruct { - TUPLE_CLASS_BOILERPLATE(OpenMPErrorConstruct); - CharBlock source; - std::tuple t; -}; - // 2.17.8 flush -> FLUSH [memory-order-clause] [(variable-name-list)] struct OpenMPFlushConstruct { TUPLE_CLASS_BOILERPLATE(OpenMPFlushConstruct); @@ -4586,7 +4602,7 @@ struct OpenMPConstruct { UNION_CLASS_BOILERPLATE(OpenMPConstruct); std::variant u; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index b07e89d201d19..fe6d82125a9e0 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2907,8 +2907,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - const parser::OpenMPErrorConstruct &) { - TODO(converter.getCurrentLocation(), "OpenMPErrorConstruct"); + const parser::OpenMPUtilityConstruct &) { + TODO(converter.getCurrentLocation(), "OpenMPUtilityConstruct"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 
67385c03f66c8..0a0a29002de27 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -737,9 +737,20 @@ TYPE_PARSER( TYPE_PARSER(sourced(construct( many(maybe(","_tok) >> sourced(Parser{}))))) -// 2.1 (variable | /common-block | array-sections) +// 2.1 (variable | /common-block/ | array-sections) TYPE_PARSER(construct(nonemptyList(Parser{}))) +TYPE_PARSER(sourced(construct( + verbatim("ERROR"_tok), Parser{}))) + +TYPE_PARSER(sourced(construct("NOTHING" >> ok))) + +TYPE_PARSER(sourced(construct( + sourced(construct( + sourced(Parser{}))) || + sourced(construct( + sourced(Parser{})))))) + // Omp directives enclosing do loop TYPE_PARSER(sourced(construct(first( "DISTRIBUTE PARALLEL DO SIMD" >> @@ -1027,9 +1038,6 @@ TYPE_PARSER(sourced(construct(verbatim("CRITICAL"_tok), TYPE_PARSER(construct( Parser{}, block, Parser{})) -TYPE_PARSER(sourced(construct( - verbatim("ERROR"_tok), Parser{}))) - // 2.11.3 Executable Allocate directive TYPE_PARSER( sourced(construct(verbatim("ALLOCATE"_tok), @@ -1127,7 +1135,7 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, // OpenMPStandaloneConstruct to resolve !$OMP ORDERED construct(Parser{}), construct(Parser{}), - construct(Parser{}), + construct(Parser{}), construct(Parser{}), construct(Parser{}), construct(Parser{}), diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 0a6af7435b4a2..4fe57f3e348d3 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2710,11 +2710,15 @@ class UnparseVisitor { Walk(x.v); return false; } - void Unparse(const OpenMPErrorConstruct &x) { + void Unparse(const OmpErrorDirective &x) { Word("!$OMP ERROR "); Walk(x.t); Put("\n"); } + void Unparse(const OmpNothingDirective &x) { + Word("!$OMP NOTHING"); + Put("\n"); + } void Unparse(const OmpSectionsDirective &x) { switch (x.v) { case llvm::omp::Directive::OMPD_sections: diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp 
index 95b962f5daf57..3a928c8a0289b 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -1688,12 +1688,12 @@ void OmpStructureChecker::Leave(const parser::OpenMPDeclareTargetConstruct &x) { dirContext_.pop_back(); } -void OmpStructureChecker::Enter(const parser::OpenMPErrorConstruct &x) { +void OmpStructureChecker::Enter(const parser::OmpErrorDirective &x) { const auto &dir{std::get(x.t)}; PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_error); } -void OmpStructureChecker::Leave(const parser::OpenMPErrorConstruct &x) { +void OmpStructureChecker::Leave(const parser::OmpErrorDirective &x) { dirContext_.pop_back(); } diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 346a7bed9138f..2a4f6fbd618c3 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -102,8 +102,8 @@ class OmpStructureChecker void Enter(const parser::OmpDeclareTargetWithList &); void Enter(const parser::OmpDeclareTargetWithClause &); void Leave(const parser::OmpDeclareTargetWithClause &); - void Enter(const parser::OpenMPErrorConstruct &); - void Leave(const parser::OpenMPErrorConstruct &); + void Enter(const parser::OmpErrorDirective &); + void Leave(const parser::OmpErrorDirective &); void Enter(const parser::OpenMPExecutableAllocate &); void Leave(const parser::OpenMPExecutableAllocate &); void Enter(const parser::OpenMPAllocatorsConstruct &); diff --git a/flang/test/Lower/OpenMP/Todo/error.f90 b/flang/test/Lower/OpenMP/Todo/error.f90 index b97e2c20a0cdf..6d3bd892da47d 100644 --- a/flang/test/Lower/OpenMP/Todo/error.f90 +++ b/flang/test/Lower/OpenMP/Todo/error.f90 @@ -1,6 +1,6 @@ ! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s -! CHECK: not yet implemented: OpenMPErrorConstruct +! 
CHECK: not yet implemented: OpenMPUtilityConstruct program p integer, allocatable :: x !$omp error at(compilation) severity(warning) message("an error") diff --git a/flang/test/Parser/OpenMP/error-unparse.f90 b/flang/test/Parser/OpenMP/error-unparse.f90 index fce5d8cf22863..4dd06b736da80 100644 --- a/flang/test/Parser/OpenMP/error-unparse.f90 +++ b/flang/test/Parser/OpenMP/error-unparse.f90 @@ -3,19 +3,19 @@ program main character(*), parameter :: message = "This is an error" !CHECK: !$OMP ERROR AT(COMPILATION) SEVERITY(WARNING) MESSAGE("some message here") - !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPErrorConstruct + !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective !PARSE-TREE: OmpClauseList -> OmpClause -> At -> OmpAtClause -> ActionTime = Compilation !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> Severity = Warning !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr -> LiteralConstant -> CharLiteralConstant !$omp error at(compilation) severity(warning) message("some message here") !CHECK: !$OMP ERROR AT(COMPILATION) SEVERITY(FATAL) MESSAGE(message) - !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPErrorConstruct + !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective !PARSE-TREE: OmpClauseList -> OmpClause -> At -> OmpAtClause -> ActionTime = Compilation !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> Severity = Fatal !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr -> Designator -> DataRef -> Name = 'message' !$omp error at(compilation) severity(fatal) message(message) !CHECK: !$OMP ERROR AT(EXECUTION) SEVERITY(FATAL) MESSAGE(message) - !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPErrorConstruct + !PARSE-TREE: ExecutionPartConstruct -> 
ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective !PARSE-TREE: OmpClauseList -> OmpClause -> At -> OmpAtClause -> ActionTime = Execution !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> Severity = Fatal !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr -> Designator -> DataRef -> Name = 'message' diff --git a/flang/test/Parser/OpenMP/nothing.f90 b/flang/test/Parser/OpenMP/nothing.f90 new file mode 100644 index 0000000000000..80c0932087610 --- /dev/null +++ b/flang/test/Parser/OpenMP/nothing.f90 @@ -0,0 +1,13 @@ +!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=51 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s +!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=51 %s | FileCheck --check-prefix="PARSE-TREE" %s + +subroutine f00 + !$omp nothing +end + +!UNPARSE: SUBROUTINE f00 +!UNPARSE: !$OMP NOTHING +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: ExecutionPart -> Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective From 62b5cf041059a90215788a0bfefb8fc180fd0b5a Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Fri, 3 Jan 2025 06:37:45 -0800 Subject: [PATCH 081/480] [Vectorizer] precommit test for miscompilation (#120731) we generate GEPs that are out of bounds but mark them as "inbound" --- ...bounds-flags-for-reverse-vector-pointer.ll | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll new file mode 100644 index 0000000000000..66bb9357750c8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have 
been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; FIXME: GEP flags on GEPs for reverse vector pointer need to be dropped when folding the tail. + +define i1 @fn(ptr %nno) #0 { +; CHECK-LABEL: define i1 @fn( +; CHECK-SAME: ptr [[NNO:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 10, [[INDEX]] +; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 10) +; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], splat (i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i32, ptr [[NNO]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 -3 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, 
<4 x i32> +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[REVERSE]], <4 x i32> poison) +; CHECK-NEXT: [[REVERSE1:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shl <4 x i32> [[REVERSE1]], splat (i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP7]], splat (i32 10) +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[REVERSE1]], <4 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP11]] = or <4 x i32> [[PREDPHI]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP11]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP12]]) +; CHECK-NEXT: br i1 true, label [[FOR_END36:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -2, [[MIDDLE_BLOCK]] ], [ 10, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY20:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC35:%.*]] ] +; CHECK-NEXT: [[SUM_01:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_1:%.*]], [[FOR_INC35]] ] +; CHECK-NEXT: [[REM4:%.*]] = and i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[CMP21:%.*]] = icmp eq i64 [[REM4]], 0 +; CHECK-NEXT: [[GEP:%.*]] = 
getelementptr inbounds nuw i32, ptr [[NNO]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: br i1 [[CMP21]], label [[IF_THEN22:%.*]], label [[FOR_INC35]] +; CHECK: if.then: +; CHECK-NEXT: [[MUL:%.*]] = shl i32 [[TMP15]], 1 +; CHECK-NEXT: [[REM27:%.*]] = urem i32 [[MUL]], 10 +; CHECK-NEXT: br label [[FOR_INC35]] +; CHECK: loop.latch: +; CHECK-NEXT: [[REM27_PN:%.*]] = phi i32 [ [[REM27]], [[IF_THEN22]] ], [ [[TMP15]], [[FOR_BODY20]] ] +; CHECK-NEXT: [[SUM_1]] = or i32 [[REM27_PN]], [[SUM_01]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: [[CMP19_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 +; CHECK-NEXT: br i1 [[CMP19_NOT]], label [[FOR_END36]], label [[FOR_BODY20]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_INC35]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[CMP41:%.*]] = icmp eq i32 [[SUM_1_LCSSA]], 0 +; CHECK-NEXT: ret i1 [[CMP41]] +; +entry: + br label %loop.header + +loop.header: ; preds = %entry, %loop.latch + %iv = phi i64 [ 10, %entry ], [ %iv.next, %loop.latch ] + %sum.01 = phi i32 [ 0, %entry ], [ %sum.1, %loop.latch ] + %rem4 = and i64 %iv, 1 + %cmp21 = icmp eq i64 %rem4, 0 + %gep = getelementptr inbounds nuw i32, ptr %nno, i64 %iv + %0 = load i32, ptr %gep, align 4 + br i1 %cmp21, label %if.then, label %loop.latch + +if.then: ; preds = %loop.header + %mul = shl i32 %0, 1 + %rem27 = urem i32 %mul, 10 + br label %loop.latch + +loop.latch: ; preds = %loop.header, %if.then + %rem27.pn = phi i32 [ %rem27, %if.then ], [ %0, %loop.header ] + %sum.1 = or i32 %rem27.pn, %sum.01 + %iv.next = add nsw i64 %iv, -1 + %cmp19.not = icmp eq i64 %iv, 0 + br i1 %cmp19.not, label %exit, label %loop.header + +exit: ; preds = %loop.latch + %sum.1.lcssa = phi i32 [ %sum.1, %loop.latch ] + %cmp41 = icmp eq i32 %sum.1.lcssa, 0 + ret i1 %cmp41 +} + +attributes #0 = { "target-features"="+avx" } From 
9d6527bc12547e28b86d180b76fe934a96aa518e Mon Sep 17 00:00:00 2001 From: Acim Maravic Date: Fri, 3 Jan 2025 15:45:52 +0100 Subject: [PATCH 082/480] [CodeGen] Add MOTargetFlag4 to MachineMemOperand Flags (#120136) --- llvm/include/llvm/CodeGen/MachineMemOperand.h | 3 ++- llvm/lib/CodeGen/MachineOperand.cpp | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h index e2343abcc4ac1..2caa3bd30487a 100644 --- a/llvm/include/llvm/CodeGen/MachineMemOperand.h +++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h @@ -152,8 +152,9 @@ class MachineMemOperand { MOTargetFlag1 = 1u << 6, MOTargetFlag2 = 1u << 7, MOTargetFlag3 = 1u << 8, + MOTargetFlag4 = 1u << 9, - LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ MOTargetFlag3) + LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ MOTargetFlag4) }; private: diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index 3a9bdde28a2e7..5c9ca91e784e9 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -1170,6 +1170,9 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, if (getFlags() & MachineMemOperand::MOTargetFlag3) OS << '"' << getTargetMMOFlagName(*TII, MachineMemOperand::MOTargetFlag3) << "\" "; + if (getFlags() & MachineMemOperand::MOTargetFlag4) + OS << '"' << getTargetMMOFlagName(*TII, MachineMemOperand::MOTargetFlag4) + << "\" "; } else { if (getFlags() & MachineMemOperand::MOTargetFlag1) OS << "\"MOTargetFlag1\" "; @@ -1177,6 +1180,8 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << "\"MOTargetFlag2\" "; if (getFlags() & MachineMemOperand::MOTargetFlag3) OS << "\"MOTargetFlag3\" "; + if (getFlags() & MachineMemOperand::MOTargetFlag4) + OS << "\"MOTargetFlag4\" "; } assert((isLoad() || isStore()) && From 3ace685105d3b50bca68328bf0c945af22d70f23 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 3 
Jan 2025 16:11:56 +0100 Subject: [PATCH 083/480] [mlir][Transforms] Support 1:N mappings in `ConversionValueMapping` (#116524) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit updates the internal `ConversionValueMapping` data structure in the dialect conversion driver to support 1:N replacements. This is the last major commit for adding 1:N support to the dialect conversion driver. Since #116470, the infrastructure already supports 1:N replacements. But the `ConversionValueMapping` still stored 1:1 value mappings. To that end, the driver inserted temporary argument materializations (converting N SSA values into 1 value). This is no longer the case. Argument materializations are now entirely gone. (They will be deleted from the type converter after some time, when we delete the old 1:N dialect conversion driver.) Note for LLVM integration: Replace all occurrences of `addArgumentMaterialization` (except for 1:N dialect conversion passes) with `addSourceMaterialization`. 
--------- Co-authored-by: Markus Böck --- .../lib/Optimizer/CodeGen/BoxedProcedure.cpp | 1 - mlir/docs/DialectConversion.md | 35 +- .../mlir/Transforms/DialectConversion.h | 18 +- .../Conversion/LLVMCommon/TypeConverter.cpp | 16 +- .../EmitC/Transforms/TypeConversions.cpp | 1 - .../Dialect/Linalg/Transforms/Detensorize.cpp | 1 - .../Quant/Transforms/StripFuncQuantTypes.cpp | 1 - .../Utils/SparseTensorDescriptor.cpp | 3 - .../Vector/Transforms/VectorLinearize.cpp | 1 - .../Transforms/Utils/DialectConversion.cpp | 477 +++++++++--------- mlir/test/Transforms/test-legalizer.mlir | 16 +- .../Func/TestDecomposeCallGraphTypes.cpp | 2 +- mlir/test/lib/Dialect/Test/TestPatterns.cpp | 9 - .../lib/Transforms/TestDialectConversion.cpp | 1 - 14 files changed, 268 insertions(+), 314 deletions(-) diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp index 1bb91d252529f..104ae7408b80c 100644 --- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp +++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp @@ -172,7 +172,6 @@ class BoxprocTypeRewriter : public mlir::TypeConverter { addConversion([&](TypeDescType ty) { return TypeDescType::get(convertType(ty.getOfTy())); }); - addArgumentMaterialization(materializeProcedure); addSourceMaterialization(materializeProcedure); addTargetMaterialization(materializeProcedure); } diff --git a/mlir/docs/DialectConversion.md b/mlir/docs/DialectConversion.md index 3168f5e13c751..abacd5a82c61e 100644 --- a/mlir/docs/DialectConversion.md +++ b/mlir/docs/DialectConversion.md @@ -242,19 +242,6 @@ cannot. These materializations are used by the conversion framework to ensure type safety during the conversion process. There are several types of materializations depending on the situation. -* Argument Materialization - - - An argument materialization is used when converting the type of a block - argument during a [signature conversion](#region-signature-conversion). 
- The new block argument types are specified in a `SignatureConversion` - object. An original block argument can be converted into multiple - block arguments, which is not supported everywhere in the dialect - conversion. (E.g., adaptors support only a single replacement value for - each original value.) Therefore, an argument materialization is used to - convert potentially multiple new block arguments back into a single SSA - value. An argument materialization is also used when replacing an op - result with multiple values. - * Source Materialization - A source materialization is used when a value was replaced with a value @@ -343,17 +330,6 @@ class TypeConverter { /// Materialization functions must be provided when a type conversion may /// persist after the conversion has finished. - /// This method registers a materialization that will be called when - /// converting (potentially multiple) block arguments that were the result of - /// a signature conversion of a single block argument, to a single SSA value - /// with the old argument type. - template ::template arg_t<1>> - void addArgumentMaterialization(FnT &&callback) { - argumentMaterializations.emplace_back( - wrapMaterialization(std::forward(callback))); - } - /// This method registers a materialization that will be called when /// converting a replacement value back to its original source type. /// This is used when some uses of the original value persist beyond the main @@ -406,12 +382,11 @@ done explicitly via a conversion pattern. To convert the types of block arguments within a Region, a custom hook on the `ConversionPatternRewriter` must be invoked; `convertRegionTypes`. This hook uses a provided type converter to apply type conversions to all blocks of a -given region. As noted above, the conversions performed by this method use the -argument materialization hook on the `TypeConverter`. 
This hook also takes an -optional `TypeConverter::SignatureConversion` parameter that applies a custom -conversion to the entry block of the region. The types of the entry block -arguments are often tied semantically to the operation, e.g., -`func::FuncOp`, `AffineForOp`, etc. +given region. This hook also takes an optional +`TypeConverter::SignatureConversion` parameter that applies a custom conversion +to the entry block of the region. The types of the entry block arguments are +often tied semantically to the operation, e.g., `func::FuncOp`, `AffineForOp`, +etc. To convert the signature of just one given block, the `applySignatureConversion` hook can be used. diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 28150e886913e..9a6975dcf8dfa 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -181,6 +181,10 @@ class TypeConverter { /// converting (potentially multiple) block arguments that were the result of /// a signature conversion of a single block argument, to a single SSA value /// with the old block argument type. + /// + /// Note: Argument materializations are used only with the 1:N dialect + /// conversion driver. The 1:N dialect conversion driver will be removed soon + /// and so will be argument materializations. template >::template arg_t<1>> void addArgumentMaterialization(FnT &&callback) { @@ -880,15 +884,7 @@ class ConversionPatternRewriter final : public PatternRewriter { void replaceOp(Operation *op, Operation *newOp) override; /// Replace the given operation with the new value ranges. The number of op - /// results and value ranges must match. If an original SSA value is replaced - /// by multiple SSA values (i.e., a value range has more than 1 element), the - /// conversion driver will insert an argument materialization to convert the - /// N SSA values back into 1 SSA value of the original type. 
The given - /// operation is erased. - /// - /// Note: The argument materialization is a workaround until we have full 1:N - /// support in the dialect conversion. (It is going to disappear from both - /// `replaceOpWithMultiple` and `applySignatureConversion`.) + /// results and value ranges must match. The given operation is erased. void replaceOpWithMultiple(Operation *op, ArrayRef newValues); /// PatternRewriter hook for erasing a dead operation. The uses of this @@ -1285,8 +1281,8 @@ struct ConversionConfig { // represented at the moment. RewriterBase::Listener *listener = nullptr; - /// If set to "true", the dialect conversion attempts to build source/target/ - /// argument materializations through the type converter API in lieu of + /// If set to "true", the dialect conversion attempts to build source/target + /// materializations through the type converter API in lieu of /// "builtin.unrealized_conversion_cast ops". The conversion process fails if /// at least one materialization could not be built. /// diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp index 49e2d94328664..72799e42cf3fd 100644 --- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp +++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp @@ -85,7 +85,7 @@ static Value unrankedMemRefMaterialization(OpBuilder &builder, UnrankedMemRefType resultType, ValueRange inputs, Location loc, const LLVMTypeConverter &converter) { - // An argument materialization must return a value of type + // A source materialization must return a value of type // `resultType`, so insert a cast from the memref descriptor type // (!llvm.struct) to the original memref type. 
Value packed = @@ -101,7 +101,7 @@ static Value rankedMemRefMaterialization(OpBuilder &builder, MemRefType resultType, ValueRange inputs, Location loc, const LLVMTypeConverter &converter) { - // An argument materialization must return a value of type `resultType`, + // A source materialization must return a value of type `resultType`, // so insert a cast from the memref descriptor type (!llvm.struct) to the // original memref type. Value packed = @@ -234,19 +234,9 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx, .getResult(0); }); - // Argument materializations convert from the new block argument types + // Source materializations convert from the new block argument types // (multiple SSA values that make up a memref descriptor) back to the // original block argument type. - addArgumentMaterialization([&](OpBuilder &builder, - UnrankedMemRefType resultType, - ValueRange inputs, Location loc) { - return unrankedMemRefMaterialization(builder, resultType, inputs, loc, - *this); - }); - addArgumentMaterialization([&](OpBuilder &builder, MemRefType resultType, - ValueRange inputs, Location loc) { - return rankedMemRefMaterialization(builder, resultType, inputs, loc, *this); - }); addSourceMaterialization([&](OpBuilder &builder, UnrankedMemRefType resultType, ValueRange inputs, Location loc) { diff --git a/mlir/lib/Dialect/EmitC/Transforms/TypeConversions.cpp b/mlir/lib/Dialect/EmitC/Transforms/TypeConversions.cpp index 0b3a494794f3f..72c8fd0f32485 100644 --- a/mlir/lib/Dialect/EmitC/Transforms/TypeConversions.cpp +++ b/mlir/lib/Dialect/EmitC/Transforms/TypeConversions.cpp @@ -33,7 +33,6 @@ void mlir::populateEmitCSizeTTypeConversions(TypeConverter &converter) { converter.addSourceMaterialization(materializeAsUnrealizedCast); converter.addTargetMaterialization(materializeAsUnrealizedCast); - converter.addArgumentMaterialization(materializeAsUnrealizedCast); } /// Get an unsigned integer or size data type corresponding to \p ty. 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp index 0e651f4cee4c3..fc6671ef81175 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp @@ -154,7 +154,6 @@ class DetensorizeTypeConverter : public TypeConverter { }); addSourceMaterialization(sourceMaterializationCallback); - addArgumentMaterialization(sourceMaterializationCallback); } }; diff --git a/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp b/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp index 6191272266283..71b88d1be1b05 100644 --- a/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp +++ b/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp @@ -56,7 +56,6 @@ class QuantizedTypeConverter : public TypeConverter { addConversion(convertQuantizedType); addConversion(convertTensorType); - addArgumentMaterialization(materializeConversion); addSourceMaterialization(materializeConversion); addTargetMaterialization(materializeConversion); } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp index 834e3634cc130..8bbb2cac5efdf 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp @@ -69,9 +69,6 @@ SparseTensorTypeToBufferConverter::SparseTensorTypeToBufferConverter() { // Required by scf.for 1:N type conversion. addSourceMaterialization(materializeTuple); - - // Required as a workaround until we have full 1:N support. 
- addArgumentMaterialization(materializeTuple); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp index 757631944f224..68535ae5a7a5c 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp @@ -481,7 +481,6 @@ void mlir::vector::populateVectorLinearizeTypeConversionsAndLegality( return builder.create(loc, type, inputs.front()); }; - typeConverter.addArgumentMaterialization(materializeCast); typeConverter.addSourceMaterialization(materializeCast); typeConverter.addTargetMaterialization(materializeCast); target.markUnknownOpDynamicallyLegal( diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 2b006430d3817..0c5520988eff3 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -11,6 +11,7 @@ #include "mlir/IR/Block.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Dominance.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/Iterators.h" #include "mlir/Interfaces/FunctionInterfaces.h" @@ -53,6 +54,55 @@ static void logFailure(llvm::ScopedPrinter &os, StringRef fmt, Args &&...args) { }); } +/// Given two insertion points in the same block, choose the later one. +static OpBuilder::InsertPoint +chooseLaterInsertPointInBlock(OpBuilder::InsertPoint a, + OpBuilder::InsertPoint b) { + assert(a.getBlock() == b.getBlock() && "expected same block"); + Block *block = a.getBlock(); + if (a.getPoint() == block->begin()) + return b; + if (b.getPoint() == block->begin()) + return a; + if (a.getPoint()->isBeforeInBlock(&*b.getPoint())) + return b; + return a; +} + +/// Helper function that chooses the insertion point among the two given ones +/// that is later. 
+// TODO: Extend DominanceInfo API to work with block iterators. +static OpBuilder::InsertPoint chooseLaterInsertPoint(OpBuilder::InsertPoint a, + OpBuilder::InsertPoint b) { + // Case 1: Fast path: Same block. This is the most common case. + if (LLVM_LIKELY(a.getBlock() == b.getBlock())) + return chooseLaterInsertPointInBlock(a, b); + + // Case 2: Different block, but same region. + if (a.getBlock()->getParent() == b.getBlock()->getParent()) { + DominanceInfo domInfo; + if (domInfo.properlyDominates(a.getBlock(), b.getBlock())) + return b; + if (domInfo.properlyDominates(b.getBlock(), a.getBlock())) + return a; + // Neither of the two blocks dominates the other. + llvm_unreachable("unable to find valid insertion point"); + } + + // Case 3: b's region contains a: choose a. + if (b.getBlock()->getParent()->findAncestorOpInRegion( + *a.getPoint()->getParentOp())) + return a; + + // Case 4: a's region contains b: choose b. + if (a.getBlock()->getParent()->findAncestorOpInRegion( + *b.getPoint()->getParentOp())) + return b; + + // Neither of the two operations contains the other. + llvm_unreachable("unable to find valid insertion point"); +} + /// Helper function that computes an insertion point where the given value is /// defined and can be used without a dominance violation. static OpBuilder::InsertPoint computeInsertPoint(Value value) { @@ -63,11 +113,38 @@ static OpBuilder::InsertPoint computeInsertPoint(Value value) { return OpBuilder::InsertPoint(insertBlock, insertPt); } +/// Helper function that computes an insertion point where the given values are +/// defined and can be used without a dominance violation.
+static OpBuilder::InsertPoint computeInsertPoint(ArrayRef vals) { + assert(!vals.empty() && "expected at least one value"); + OpBuilder::InsertPoint pt = computeInsertPoint(vals.front()); + for (Value v : vals.drop_front()) + pt = chooseLaterInsertPoint(pt, computeInsertPoint(v)); + return pt; +} + //===----------------------------------------------------------------------===// // ConversionValueMapping //===----------------------------------------------------------------------===// +/// A vector of SSA values, optimized for the most common case of a single +/// value. +using ValueVector = SmallVector; + namespace { + +/// Helper class to make it possible to use `ValueVector` as a key in DenseMap. +struct ValueVectorMapInfo { + static ValueVector getEmptyKey() { return ValueVector{}; } + static ValueVector getTombstoneKey() { return ValueVector{}; } + static ::llvm::hash_code getHashValue(const ValueVector &val) { + return ::llvm::hash_combine_range(val.begin(), val.end()); + } + static bool isEqual(const ValueVector &LHS, const ValueVector &RHS) { + return LHS == RHS; + } +}; + /// This class wraps a IRMapping to provide recursive lookup /// functionality, i.e. we will traverse if the mapped value also has a mapping. struct ConversionValueMapping { @@ -75,68 +152,129 @@ struct ConversionValueMapping { /// false positives. bool isMappedTo(Value value) const { return mappedTo.contains(value); } - /// Lookup the most recently mapped value with the desired type in the + /// Lookup the most recently mapped values with the desired types in the /// mapping. /// /// Special cases: - /// - If the desired type is "null", simply return the most recently mapped + /// - If the desired type range is empty, simply return the most recently + /// mapped values. + /// - If there is no mapping to the desired types, also return the most + /// recently mapped values. + /// - If there is no mapping for the given values at all, return the given /// value. 
- /// - If there is no mapping to the desired type, also return the most - /// recently mapped value. - /// - If there is no mapping for the given value at all, return the given - /// value. - Value lookupOrDefault(Value from, Type desiredType = nullptr) const; + ValueVector lookupOrDefault(Value from, TypeRange desiredTypes = {}) const; + + /// Lookup the given value within the map, or return an empty vector if the + /// value is not mapped. If it is mapped, this follows the same behavior + /// as `lookupOrDefault`. + ValueVector lookupOrNull(Value from, TypeRange desiredTypes = {}) const; - /// Lookup a mapped value within the map, or return null if a mapping does not - /// exist. If a mapping exists, this follows the same behavior of - /// `lookupOrDefault`. - Value lookupOrNull(Value from, Type desiredType = nullptr) const; + template + struct IsValueVector : std::is_same, ValueVector> {}; - /// Map a value to the one provided. - void map(Value oldVal, Value newVal) { + /// Map a value vector to the one provided. + template + std::enable_if_t::value && IsValueVector::value> + map(OldVal &&oldVal, NewVal &&newVal) { LLVM_DEBUG({ - for (Value it = newVal; it; it = mapping.lookupOrNull(it)) - assert(it != oldVal && "inserting cyclic mapping"); + ValueVector next(newVal); + while (true) { + assert(next != oldVal && "inserting cyclic mapping"); + auto it = mapping.find(next); + if (it == mapping.end()) + break; + next = it->second; + } }); - mapping.map(oldVal, newVal); - mappedTo.insert(newVal); + for (Value v : newVal) + mappedTo.insert(v); + + mapping[std::forward(oldVal)] = std::forward(newVal); + } + + /// Map a value vector or single value to the one provided. 
+ template + std::enable_if_t::value || + !IsValueVector::value> + map(OldVal &&oldVal, NewVal &&newVal) { + if constexpr (IsValueVector{}) { + map(std::forward(oldVal), ValueVector{newVal}); + } else if constexpr (IsValueVector{}) { + map(ValueVector{oldVal}, std::forward(newVal)); + } else { + map(ValueVector{oldVal}, ValueVector{newVal}); + } } - /// Drop the last mapping for the given value. - void erase(Value value) { mapping.erase(value); } + /// Drop the last mapping for the given values. + void erase(const ValueVector &value) { mapping.erase(value); } private: /// Current value mappings. - IRMapping mapping; + DenseMap mapping; /// All SSA values that are mapped to. May contain false positives. DenseSet mappedTo; }; } // namespace -Value ConversionValueMapping::lookupOrDefault(Value from, - Type desiredType) const { - // Try to find the deepest value that has the desired type. If there is no - // such value, simply return the deepest value. - Value desiredValue; +ValueVector +ConversionValueMapping::lookupOrDefault(Value from, + TypeRange desiredTypes) const { + // Try to find the deepest values that have the desired types. If there is no + // such mapping, simply return the deepest values. + ValueVector desiredValue; + ValueVector current{from}; do { - if (!desiredType || from.getType() == desiredType) - desiredValue = from; + // Store the current value if the types match. + if (TypeRange(current) == desiredTypes) + desiredValue = current; + + // If possible, Replace each value with (one or multiple) mapped values. + ValueVector next; + for (Value v : current) { + auto it = mapping.find({v}); + if (it != mapping.end()) { + llvm::append_range(next, it->second); + } else { + next.push_back(v); + } + } + if (next != current) { + // If at least one value was replaced, continue the lookup from there. 
+ current = std::move(next); + continue; + } - Value mappedValue = mapping.lookupOrNull(from); - if (!mappedValue) + // Otherwise: Check if there is a mapping for the entire vector. Such + // mappings are materializations. (N:M mapping are not supported for value + // replacements.) + // + // Note: From a correctness point of view, materializations do not have to + // be stored (and looked up) in the mapping. But for performance reasons, + // we choose to reuse existing IR (when possible) instead of creating it + // multiple times. + auto it = mapping.find(current); + if (it == mapping.end()) { + // No mapping found: The lookup stops here. break; - from = mappedValue; + } + current = it->second; } while (true); - // If the desired value was found use it, otherwise default to the leaf value. - return desiredValue ? desiredValue : from; + // If the desired values were found use them, otherwise default to the leaf + // values. + // Note: If `desiredTypes` is empty, this function always returns `current`. + return !desiredValue.empty() ? std::move(desiredValue) : std::move(current); } -Value ConversionValueMapping::lookupOrNull(Value from, Type desiredType) const { - Value result = lookupOrDefault(from, desiredType); - if (result == from || (desiredType && result.getType() != desiredType)) - return nullptr; +ValueVector ConversionValueMapping::lookupOrNull(Value from, + TypeRange desiredTypes) const { + ValueVector result = lookupOrDefault(from, desiredTypes); + TypeRange resultTypes(result); + if (result == ValueVector{from} || + (!desiredTypes.empty() && resultTypes != desiredTypes)) + return {}; return result; } @@ -651,10 +789,6 @@ class CreateOperationRewrite : public OperationRewrite { /// The type of materialization. enum MaterializationKind { - /// This materialization materializes a conversion for an illegal block - /// argument type, to the original one. - Argument, - /// This materialization materializes a conversion from an illegal type to a /// legal one. 
Target, @@ -673,7 +807,7 @@ class UnresolvedMaterializationRewrite : public OperationRewrite { UnrealizedConversionCastOp op, const TypeConverter *converter, MaterializationKind kind, Type originalType, - Value mappedValue); + ValueVector mappedValues); static bool classof(const IRRewrite *rewrite) { return rewrite->getKind() == Kind::UnresolvedMaterialization; @@ -708,9 +842,9 @@ class UnresolvedMaterializationRewrite : public OperationRewrite { /// materializations. Type originalType; - /// The value in the conversion value mapping that is being replaced by the + /// The values in the conversion value mapping that are being replaced by the /// results of this unresolved materialization. - Value mappedValue; + ValueVector mappedValues; }; } // namespace @@ -779,7 +913,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { LogicalResult remapValues(StringRef valueDiagTag, std::optional inputLoc, PatternRewriter &rewriter, ValueRange values, - SmallVector> &remapped); + SmallVector &remapped); /// Return "true" if the given operation is ignored, and does not need to be /// converted. @@ -820,39 +954,14 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// If a cast op was built, it can optionally be returned with the `castOp` /// output argument. /// - /// If `valueToMap` is set to a non-null Value, then that value is mapped to + /// If `valuesToMap` is non-empty, then those values are mapped to /// the results of the unresolved materialization in the conversion value /// mapping.
ValueRange buildUnresolvedMaterialization( MaterializationKind kind, OpBuilder::InsertPoint ip, Location loc, - Value valueToMap, ValueRange inputs, TypeRange outputTypes, + ValueVector valuesToMap, ValueRange inputs, TypeRange outputTypes, Type originalType, const TypeConverter *converter, UnrealizedConversionCastOp *castOp = nullptr); - Value buildUnresolvedMaterialization( - MaterializationKind kind, OpBuilder::InsertPoint ip, Location loc, - Value valueToMap, ValueRange inputs, Type outputType, Type originalType, - const TypeConverter *converter, - UnrealizedConversionCastOp *castOp = nullptr) { - return buildUnresolvedMaterialization(kind, ip, loc, valueToMap, inputs, - TypeRange(outputType), originalType, - converter, castOp) - .front(); - } - - /// Build an N:1 materialization for the given original value that was - /// replaced with the given replacement values. - /// - /// This is a workaround around incomplete 1:N support in the dialect - /// conversion driver. The conversion mapping can store only 1:1 replacements - /// and the conversion patterns only support single Value replacements in the - /// adaptor, so N values must be converted back to a single value. This - /// function will be deleted when full 1:N support has been added. - /// - /// This function inserts an argument materialization back to the original - /// type. - void insertNTo1Materialization(OpBuilder::InsertPoint ip, Location loc, - ValueRange replacements, Value originalValue, - const TypeConverter *converter); /// Find a replacement value for the given SSA value in the conversion value /// mapping. The replacement value must have the same type as the given SSA @@ -862,16 +971,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { Value findOrBuildReplacementValue(Value value, const TypeConverter *converter); - /// Unpack an N:1 materialization and return the inputs of the - /// materialization. 
This function unpacks only those materializations that - /// were built with `insertNTo1Materialization`. - /// - /// This is a workaround around incomplete 1:N support in the dialect - /// conversion driver. It allows us to write 1:N conversion patterns while - /// 1:N support is still missing in the conversion value mapping. This - /// function will be deleted when full 1:N support has been added. - SmallVector unpackNTo1Materialization(Value value); - //===--------------------------------------------------------------------===// // Rewriter Notification Hooks //===--------------------------------------------------------------------===// @@ -1041,7 +1140,7 @@ void ReplaceBlockArgRewrite::commit(RewriterBase &rewriter) { }); } -void ReplaceBlockArgRewrite::rollback() { rewriterImpl.mapping.erase(arg); } +void ReplaceBlockArgRewrite::rollback() { rewriterImpl.mapping.erase({arg}); } void ReplaceOperationRewrite::commit(RewriterBase &rewriter) { auto *listener = @@ -1082,7 +1181,7 @@ void ReplaceOperationRewrite::commit(RewriterBase &rewriter) { void ReplaceOperationRewrite::rollback() { for (auto result : op->getResults()) - rewriterImpl.mapping.erase(result); + rewriterImpl.mapping.erase({result}); } void ReplaceOperationRewrite::cleanup(RewriterBase &rewriter) { @@ -1101,18 +1200,18 @@ void CreateOperationRewrite::rollback() { UnresolvedMaterializationRewrite::UnresolvedMaterializationRewrite( ConversionPatternRewriterImpl &rewriterImpl, UnrealizedConversionCastOp op, const TypeConverter *converter, MaterializationKind kind, Type originalType, - Value mappedValue) + ValueVector mappedValues) : OperationRewrite(Kind::UnresolvedMaterialization, rewriterImpl, op), converterAndKind(converter, kind), originalType(originalType), - mappedValue(mappedValue) { + mappedValues(std::move(mappedValues)) { assert((!originalType || kind == MaterializationKind::Target) && "original type is valid only for target materializations"); rewriterImpl.unresolvedMaterializations[op] = 
this; } void UnresolvedMaterializationRewrite::rollback() { - if (mappedValue) - rewriterImpl.mapping.erase(mappedValue); + if (!mappedValues.empty()) + rewriterImpl.mapping.erase(mappedValues); rewriterImpl.unresolvedMaterializations.erase(getOperation()); rewriterImpl.nTo1TempMaterializations.erase(getOperation()); op->erase(); @@ -1160,7 +1259,7 @@ void ConversionPatternRewriterImpl::undoRewrites(unsigned numRewritesToKeep) { LogicalResult ConversionPatternRewriterImpl::remapValues( StringRef valueDiagTag, std::optional inputLoc, PatternRewriter &rewriter, ValueRange values, - SmallVector> &remapped) { + SmallVector &remapped) { remapped.reserve(llvm::size(values)); for (const auto &it : llvm::enumerate(values)) { @@ -1168,18 +1267,11 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( Type origType = operand.getType(); Location operandLoc = inputLoc ? *inputLoc : operand.getLoc(); - // Find the most recently mapped value. Unpack all temporary N:1 - // materializations. Such conversions are a workaround around missing - // 1:N support in the ConversionValueMapping. (The conversion patterns - // already support 1:N replacements.) - Value repl = mapping.lookupOrDefault(operand); - SmallVector unpacked = unpackNTo1Materialization(repl); - if (!currentTypeConverter) { // The current pattern does not have a type converter. I.e., it does not // distinguish between legal and illegal types. For each operand, simply - // pass through the most recently mapped value. - remapped.push_back(std::move(unpacked)); + // pass through the most recently mapped values. + remapped.push_back(mapping.lookupOrDefault(operand)); continue; } @@ -1192,51 +1284,28 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( }); return failure(); } - // If a type is converted to 0 types, there is nothing to do. if (legalTypes.empty()) { remapped.push_back({}); continue; } - if (legalTypes.size() != 1) { - // TODO: This is a 1:N conversion. 
The conversion value mapping does not - // store such materializations yet. If the types of the most recently - // mapped values do not match, build a target materialization. - ValueRange unpackedRange(unpacked); - if (TypeRange(unpackedRange) == legalTypes) { - remapped.push_back(std::move(unpacked)); - continue; - } - - // Insert a target materialization if the current pattern expects - // different legalized types. - ValueRange targetMat = buildUnresolvedMaterialization( - MaterializationKind::Target, computeInsertPoint(repl), operandLoc, - /*valueToMap=*/Value(), /*inputs=*/unpacked, - /*outputTypes=*/legalTypes, /*originalType=*/origType, - currentTypeConverter); - remapped.push_back(targetMat); + ValueVector repl = mapping.lookupOrDefault(operand, legalTypes); + if (!repl.empty() && TypeRange(repl) == legalTypes) { + // Mapped values have the correct type or there is an existing + // materialization. Or the operand is not mapped at all and has the + // correct type. + remapped.push_back(std::move(repl)); continue; } - // Handle 1->1 type conversions. - Type desiredType = legalTypes.front(); - // Try to find a mapped value with the desired type. (Or the operand itself - // if the value is not mapped at all.) - Value newOperand = mapping.lookupOrDefault(operand, desiredType); - if (newOperand.getType() != desiredType) { - // If the looked up value's type does not have the desired type, it means - // that the value was replaced with a value of different type and no - // target materialization was created yet. - Value castValue = buildUnresolvedMaterialization( - MaterializationKind::Target, computeInsertPoint(newOperand), - operandLoc, /*valueToMap=*/newOperand, /*inputs=*/unpacked, - /*outputType=*/desiredType, /*originalType=*/origType, - currentTypeConverter); - newOperand = castValue; - } - remapped.push_back({newOperand}); + // Create a materialization for the most recently mapped values. 
+ repl = mapping.lookupOrDefault(operand); + ValueRange castValues = buildUnresolvedMaterialization( + MaterializationKind::Target, computeInsertPoint(repl), operandLoc, + /*valuesToMap=*/repl, /*inputs=*/repl, /*outputTypes=*/legalTypes, + /*originalType=*/origType, currentTypeConverter); + remapped.push_back(castValues); } return success(); } @@ -1353,7 +1422,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( buildUnresolvedMaterialization( MaterializationKind::Source, OpBuilder::InsertPoint(newBlock, newBlock->begin()), origArg.getLoc(), - /*valueToMap=*/origArg, /*inputs=*/ValueRange(), + /*valuesToMap=*/{origArg}, /*inputs=*/ValueRange(), /*outputType=*/origArgType, /*originalType=*/Type(), converter); appendRewrite(block, origArg, converter); continue; @@ -1369,19 +1438,11 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( continue; } - // This is a 1->1+ mapping. 1->N mappings are not fully supported in the - // dialect conversion. Therefore, we need an argument materialization to - // turn the replacement block arguments into a single SSA value that can be - // used as a replacement. + // This is a 1->1+ mapping. auto replArgs = newBlock->getArguments().slice(inputMap->inputNo, inputMap->size); - if (replArgs.size() == 1) { - mapping.map(origArg, replArgs.front()); - } else { - insertNTo1Materialization( - OpBuilder::InsertPoint(newBlock, newBlock->begin()), origArg.getLoc(), - /*replacements=*/replArgs, /*outputValue=*/origArg, converter); - } + ValueVector replArgVals = llvm::to_vector_of(replArgs); + mapping.map(origArg, std::move(replArgVals)); appendRewrite(block, origArg, converter); } @@ -1402,7 +1463,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( /// of input operands. 
ValueRange ConversionPatternRewriterImpl::buildUnresolvedMaterialization( MaterializationKind kind, OpBuilder::InsertPoint ip, Location loc, - Value valueToMap, ValueRange inputs, TypeRange outputTypes, + ValueVector valuesToMap, ValueRange inputs, TypeRange outputTypes, Type originalType, const TypeConverter *converter, UnrealizedConversionCastOp *castOp) { assert((!originalType || kind == MaterializationKind::Target) && @@ -1410,10 +1471,8 @@ ValueRange ConversionPatternRewriterImpl::buildUnresolvedMaterialization( // Avoid materializing an unnecessary cast. if (TypeRange(inputs) == outputTypes) { - if (valueToMap) { - assert(inputs.size() == 1 && "1:N mapping is not supported"); - mapping.map(valueToMap, inputs.front()); - } + if (!valuesToMap.empty()) + mapping.map(std::move(valuesToMap), inputs); return inputs; } @@ -1423,37 +1482,21 @@ ValueRange ConversionPatternRewriterImpl::buildUnresolvedMaterialization( builder.setInsertionPoint(ip.getBlock(), ip.getPoint()); auto convertOp = builder.create(loc, outputTypes, inputs); - if (valueToMap) { - assert(outputTypes.size() == 1 && "1:N mapping is not supported"); - mapping.map(valueToMap, convertOp.getResult(0)); - } + if (!valuesToMap.empty()) + mapping.map(valuesToMap, convertOp.getResults()); if (castOp) *castOp = convertOp; - appendRewrite(convertOp, converter, kind, - originalType, valueToMap); + appendRewrite( + convertOp, converter, kind, originalType, std::move(valuesToMap)); return convertOp.getResults(); } -void ConversionPatternRewriterImpl::insertNTo1Materialization( - OpBuilder::InsertPoint ip, Location loc, ValueRange replacements, - Value originalValue, const TypeConverter *converter) { - // Insert argument materialization back to the original type. 
- Type originalType = originalValue.getType(); - UnrealizedConversionCastOp argCastOp; - buildUnresolvedMaterialization( - MaterializationKind::Argument, ip, loc, /*valueToMap=*/originalValue, - /*inputs=*/replacements, originalType, - /*originalType=*/Type(), converter, &argCastOp); - if (argCastOp) - nTo1TempMaterializations.insert(argCastOp); -} - Value ConversionPatternRewriterImpl::findOrBuildReplacementValue( Value value, const TypeConverter *converter) { // Find a replacement value with the same type. - Value repl = mapping.lookupOrNull(value, value.getType()); - if (repl) - return repl; + ValueVector repl = mapping.lookupOrNull(value, value.getType()); + if (!repl.empty()) + return repl.front(); // Check if the value is dead. No replacement value is needed in that case. // This is an approximate check that may have false negatives but does not @@ -1468,7 +1511,7 @@ Value ConversionPatternRewriterImpl::findOrBuildReplacementValue( // (regardless of the type) and build a source materialization to the // original type. repl = mapping.lookupOrNull(value); - if (!repl) { + if (repl.empty()) { // No replacement value is registered in the mapping. This means that the // value is dropped and no longer needed. (If the value were still needed, // a source materialization producing a replacement value "out of thin air" @@ -1476,36 +1519,29 @@ Value ConversionPatternRewriterImpl::findOrBuildReplacementValue( // `applySignatureConversion`.) return Value(); } - Value castValue = buildUnresolvedMaterialization( - MaterializationKind::Source, computeInsertPoint(repl), value.getLoc(), - /*valueToMap=*/value, /*inputs=*/repl, /*outputType=*/value.getType(), - /*originalType=*/Type(), converter); + + // Note: `computeInsertPoint` computes the "earliest" insertion point at + // which all values in `repl` are defined. It is important to emit the + // materialization at that location because the same materialization may be + // reused in a different context. 
(That's because materializations are cached + // in the conversion value mapping.) The insertion point of the + // materialization must be valid for all future users that may be created + // later in the conversion process. + // + // Note: Instead of creating new IR, `buildUnresolvedMaterialization` may + // return an already existing, cached materialization from the conversion + // value mapping. + Value castValue = + buildUnresolvedMaterialization(MaterializationKind::Source, + computeInsertPoint(repl), value.getLoc(), + /*valuesToMap=*/{value}, /*inputs=*/repl, + /*outputType=*/value.getType(), + /*originalType=*/Type(), converter) + .front(); mapping.map(value, castValue); return castValue; } -SmallVector -ConversionPatternRewriterImpl::unpackNTo1Materialization(Value value) { - // Unpack unrealized_conversion_cast ops that were inserted as a N:1 - // workaround. - auto castOp = value.getDefiningOp(); - if (!castOp) - return {value}; - if (!nTo1TempMaterializations.contains(castOp)) - return {value}; - assert(castOp->getNumResults() == 1 && "expected single result"); - - SmallVector result; - for (Value v : castOp.getOperands()) { - // Keep unpacking if possible. This is needed because during block - // signature conversions and 1:N op replacements, the driver may have - // inserted two materializations back-to-back: first an argument - // materialization, then a target materialization. - llvm::append_range(result, unpackNTo1Materialization(v)); - } - return result; -} - //===----------------------------------------------------------------------===// // Rewriter Notification Hooks @@ -1554,7 +1590,7 @@ void ConversionPatternRewriterImpl::notifyOpReplaced( // Materialize a replacement value "out of thin air". 
buildUnresolvedMaterialization( MaterializationKind::Source, computeInsertPoint(result), - result.getLoc(), /*valueToMap=*/result, /*inputs=*/ValueRange(), + result.getLoc(), /*valuesToMap=*/{result}, /*inputs=*/ValueRange(), /*outputType=*/result.getType(), /*originalType=*/Type(), currentTypeConverter); continue; @@ -1572,16 +1608,7 @@ void ConversionPatternRewriterImpl::notifyOpReplaced( // Remap result to replacement value. if (repl.empty()) continue; - - if (repl.size() == 1) { - // Single replacement value: replace directly. - mapping.map(result, repl.front()); - } else { - // Multiple replacement values: insert N:1 materialization. - insertNTo1Materialization(computeInsertPoint(result), result.getLoc(), - /*replacements=*/repl, /*outputValue=*/result, - currentTypeConverter); - } + mapping.map(result, repl); } appendRewrite(op, currentTypeConverter); @@ -1660,8 +1687,13 @@ void ConversionPatternRewriter::replaceOp(Operation *op, ValueRange newValues) { << "** Replace : '" << op->getName() << "'(" << op << ")\n"; }); SmallVector newVals; - for (size_t i = 0; i < newValues.size(); ++i) - newVals.push_back(newValues.slice(i, 1)); + for (size_t i = 0; i < newValues.size(); ++i) { + if (newValues[i]) { + newVals.push_back(newValues.slice(i, 1)); + } else { + newVals.push_back(ValueRange()); + } + } impl->notifyOpReplaced(op, newVals); } @@ -1733,7 +1765,7 @@ void ConversionPatternRewriter::replaceUsesOfBlockArgument(BlockArgument from, } Value ConversionPatternRewriter::getRemappedValue(Value key) { - SmallVector> remappedValues; + SmallVector remappedValues; if (failed(impl->remapValues("value", /*inputLoc=*/std::nullopt, *this, key, remappedValues))) return nullptr; @@ -1746,7 +1778,7 @@ ConversionPatternRewriter::getRemappedValues(ValueRange keys, SmallVectorImpl &results) { if (keys.empty()) return success(); - SmallVector> remapped; + SmallVector remapped; if (failed(impl->remapValues("value", /*inputLoc=*/std::nullopt, *this, keys, remapped))) return 
failure(); @@ -1872,7 +1904,7 @@ ConversionPattern::matchAndRewrite(Operation *op, getTypeConverter()); // Remap the operands of the operation. - SmallVector> remapped; + SmallVector remapped; if (failed(rewriterImpl.remapValues("operand", op->getLoc(), rewriter, op->getOperands(), remapped))) { return failure(); @@ -2625,19 +2657,6 @@ legalizeUnresolvedMaterialization(RewriterBase &rewriter, rewriter.setInsertionPoint(op); SmallVector newMaterialization; switch (rewrite->getMaterializationKind()) { - case MaterializationKind::Argument: { - // Try to materialize an argument conversion. - assert(op->getNumResults() == 1 && "expected single result"); - Value argMat = converter->materializeArgumentConversion( - rewriter, op->getLoc(), op.getResultTypes().front(), inputOperands); - if (argMat) { - newMaterialization.push_back(argMat); - break; - } - } - // If an argument materialization failed, fallback to trying a target - // materialization. - [[fallthrough]]; case MaterializationKind::Target: newMaterialization = converter->materializeTargetConversion( rewriter, op->getLoc(), op.getResultTypes(), inputOperands, diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir index 297eb5acef21b..ae7d344b7167f 100644 --- a/mlir/test/Transforms/test-legalizer.mlir +++ b/mlir/test/Transforms/test-legalizer.mlir @@ -64,9 +64,6 @@ func.func @remap_call_1_to_1(%arg0: i64) { // Contents of the old block are moved to the new block. // CHECK-NEXT: notifyOperationInserted: test.return, was linked, exact position unknown -// The new block arguments are used in "test.return". -// CHECK-NEXT: notifyOperationModified: test.return - // The old block is erased. 
// CHECK-NEXT: notifyBlockErased @@ -390,8 +387,8 @@ func.func @caller() { // CHECK: %[[call:.*]]:2 = call @callee() : () -> (f16, f16) %0:2 = func.call @callee() : () -> (f32, i24) - // CHECK: %[[cast1:.*]] = "test.cast"() : () -> i24 - // CHECK: %[[cast0:.*]] = "test.cast"(%[[call]]#0, %[[call]]#1) : (f16, f16) -> f32 + // CHECK-DAG: %[[cast1:.*]] = "test.cast"() : () -> i24 + // CHECK-DAG: %[[cast0:.*]] = "test.cast"(%[[call]]#0, %[[call]]#1) : (f16, f16) -> f32 // CHECK: "test.some_user"(%[[cast0]], %[[cast1]]) : (f32, i24) -> () // expected-remark @below{{'test.some_user' is not legalizable}} "test.some_user"(%0#0, %0#1) : (f32, i24) -> () @@ -494,13 +491,8 @@ func.func @test_1_to_n_block_signature_conversion() { // CHECK-LABEL: func @test_multiple_1_to_n_replacement() // CHECK: %[[legal_op:.*]]:4 = "test.legal_op"() : () -> (f16, f16, f16, f16) -// TODO: There should be a single cast (i.e., a single target materialization). -// This is currently not possible due to 1:N limitations of the conversion -// mapping. Instead, we have 3 argument materializations. 
-// CHECK: %[[cast1:.*]] = "test.cast"(%[[legal_op]]#2, %[[legal_op]]#3) : (f16, f16) -> f16 -// CHECK: %[[cast2:.*]] = "test.cast"(%[[legal_op]]#0, %[[legal_op]]#1) : (f16, f16) -> f16 -// CHECK: %[[cast3:.*]] = "test.cast"(%[[cast2]], %[[cast1]]) : (f16, f16) -> f16 -// CHECK: "test.valid"(%[[cast3]]) : (f16) -> () +// CHECK: %[[cast:.*]] = "test.cast"(%[[legal_op]]#0, %[[legal_op]]#1, %[[legal_op]]#2, %[[legal_op]]#3) : (f16, f16, f16, f16) -> f16 +// CHECK: "test.valid"(%[[cast]]) : (f16) -> () func.func @test_multiple_1_to_n_replacement() { %0 = "test.multiple_1_to_n_replacement"() : () -> (f16) "test.invalid"(%0) : (f16) -> () diff --git a/mlir/test/lib/Dialect/Func/TestDecomposeCallGraphTypes.cpp b/mlir/test/lib/Dialect/Func/TestDecomposeCallGraphTypes.cpp index 09c5b4b2a0ad5..d0b62e71ab0cf 100644 --- a/mlir/test/lib/Dialect/Func/TestDecomposeCallGraphTypes.cpp +++ b/mlir/test/lib/Dialect/Func/TestDecomposeCallGraphTypes.cpp @@ -139,7 +139,7 @@ struct TestDecomposeCallGraphTypes tupleType.getFlattenedTypes(types); return success(); }); - typeConverter.addArgumentMaterialization(buildMakeTupleOp); + typeConverter.addSourceMaterialization(buildMakeTupleOp); typeConverter.addTargetMaterialization(buildDecomposeTuple); populateFunctionOpInterfaceTypeConversionPattern( diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index 826c222990be4..5b7c36c9b97bf 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -1264,14 +1264,6 @@ class TestMultiple1ToNReplacement : public ConversionPattern { // Replace test.multiple_1_to_n_replacement with test.step_1. Operation *repl1 = replaceWithDoubleResults(op, "test.step_1"); // Now replace test.step_1 with test.legal_op. - // TODO: Ideally, it should not be necessary to reset the insertion point - // here. Based on the API calls, it looks like test.step_1 is entirely - // erased. 
But that's not the case: an argument materialization will - // survive. And that argument materialization will be used by the users of - // `op`. If we don't reset the insertion point here, we get dominance - // errors. This will be fixed when we have 1:N support in the conversion - // value mapping. - rewriter.setInsertionPoint(repl1); replaceWithDoubleResults(repl1, "test.legal_op"); return success(); } @@ -1284,7 +1276,6 @@ struct TestTypeConverter : public TypeConverter { using TypeConverter::TypeConverter; TestTypeConverter() { addConversion(convertType); - addArgumentMaterialization(materializeCast); addSourceMaterialization(materializeCast); } diff --git a/mlir/test/lib/Transforms/TestDialectConversion.cpp b/mlir/test/lib/Transforms/TestDialectConversion.cpp index 2cc1fb5d39d78..a03bf0a1023d5 100644 --- a/mlir/test/lib/Transforms/TestDialectConversion.cpp +++ b/mlir/test/lib/Transforms/TestDialectConversion.cpp @@ -28,7 +28,6 @@ namespace { struct PDLLTypeConverter : public TypeConverter { PDLLTypeConverter() { addConversion(convertType); - addArgumentMaterialization(materializeCast); addSourceMaterialization(materializeCast); } From 68d265666e708bad1c63b419b6275aaba1a7dcd2 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Fri, 3 Jan 2025 16:15:27 +0100 Subject: [PATCH 084/480] [clang][NFC][docs] Fix typo in LanguageExtensions (#121576) --- clang/docs/LanguageExtensions.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index cc5f1d4ddf447..e020710c7aa4f 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -3641,7 +3641,7 @@ program location should be executed. It is expected to be used to implement `_ intrinsic. -The ``__builtin_allow_runtime_check()`` can be used within constrol structures +The ``__builtin_allow_runtime_check()`` can be used within control structures like ``if`` to guard expensive runtime checks. 
The return value is determined by the following compiler options and may differ per call site: From a4d92400a6db9566d84cb4b900149e36e117f452 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 3 Jan 2025 23:19:57 +0800 Subject: [PATCH 085/480] [InstCombine] Fix GEPNoWrapFlags propagation in `foldGEPOfPhi` (#121572) Closes https://github.com/llvm/llvm-project/issues/121459. --- .../InstCombine/InstructionCombining.cpp | 5 ++ .../test/Transforms/InstCombine/opaque-ptr.ll | 58 +++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 934156f04f7fd..f63de1f0d410e 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2782,6 +2782,7 @@ static Instruction *foldGEPOfPhi(GetElementPtrInst &GEP, PHINode *PN, // loop iteration). if (Op1 == &GEP) return nullptr; + GEPNoWrapFlags NW = Op1->getNoWrapFlags(); int DI = -1; @@ -2838,6 +2839,8 @@ static Instruction *foldGEPOfPhi(GetElementPtrInst &GEP, PHINode *PN, } } } + + NW &= Op2->getNoWrapFlags(); } // If not all GEPs are identical we'll have to create a new PHI node. @@ -2847,6 +2850,8 @@ static Instruction *foldGEPOfPhi(GetElementPtrInst &GEP, PHINode *PN, return nullptr; auto *NewGEP = cast(Op1->clone()); + NewGEP->setNoWrapFlags(NW); + if (DI == -1) { // All the GEPs feeding the PHI are identical. Clone one down into our // BB so that it can be merged with the current GEP. 
diff --git a/llvm/test/Transforms/InstCombine/opaque-ptr.ll b/llvm/test/Transforms/InstCombine/opaque-ptr.ll index bac51c82f36dd..b05274658e812 100644 --- a/llvm/test/Transforms/InstCombine/opaque-ptr.ll +++ b/llvm/test/Transforms/InstCombine/opaque-ptr.ll @@ -654,6 +654,64 @@ join: ret ptr %gep } +define ptr @gep_of_phi_of_gep_flags1(i1 %c, ptr %p) { +; CHECK-LABEL: @gep_of_phi_of_gep_flags1( +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK: if: +; CHECK-NEXT: br label [[JOIN:%.*]] +; CHECK: else: +; CHECK-NEXT: br label [[JOIN]] +; CHECK: join: +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 4, [[IF]] ], [ 8, [[ELSE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4 +; CHECK-NEXT: ret ptr [[GEP]] +; + br i1 %c, label %if, label %else + +if: + %gep1 = getelementptr inbounds i32, ptr %p, i64 1 + br label %join + +else: + %gep2 = getelementptr i32, ptr %p, i64 2 + br label %join + +join: + %phi = phi ptr [ %gep1, %if ], [ %gep2, %else ] + %gep = getelementptr i32, ptr %phi, i64 1 + ret ptr %gep +} + +define ptr @gep_of_phi_of_gep_flags2(i1 %c, ptr %p) { +; CHECK-LABEL: @gep_of_phi_of_gep_flags2( +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK: if: +; CHECK-NEXT: br label [[JOIN:%.*]] +; CHECK: else: +; CHECK-NEXT: br label [[JOIN]] +; CHECK: join: +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 4, [[IF]] ], [ 8, [[ELSE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr nuw i8, ptr [[P:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4 +; CHECK-NEXT: ret ptr [[GEP]] +; + br i1 %c, label %if, label %else + +if: + %gep1 = getelementptr nuw i32, ptr %p, i64 1 + br label %join + +else: + %gep2 = getelementptr nuw i32, ptr %p, i64 2 + br label %join + +join: + %phi = phi ptr [ %gep1, %if ], [ %gep2, %else ] + %gep = getelementptr i32, ptr %phi, i64 1 + ret ptr %gep +} + define ptr 
@gep_of_phi_of_gep_different_type(i1 %c, ptr %p) { ; CHECK-LABEL: @gep_of_phi_of_gep_different_type( ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] From adeff9f63a24f60b0bf240bf13e40bbf7c1dd0e8 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 3 Jan 2025 09:21:36 -0600 Subject: [PATCH 086/480] [flang][OpenMP] Allow utility constructs in specification part (#121509) Allow utility constructs (error and nothing) to appear in the specification part as well as the execution part. The exception is "ERROR AT(EXECUTION)" which should only be in the execution part. In case of ambiguity (the boundary between the specification and the execution part), utility constructs will be parsed as belonging to the specification part. In such cases move them to the execution part in the OpenMP canonicalization code. --- .../FlangOmpReport/FlangOmpReportVisitor.cpp | 14 +- flang/include/flang/Parser/parse-tree.h | 2 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 4 + flang/lib/Parser/openmp-parsers.cpp | 4 +- flang/lib/Parser/unparse.cpp | 101 +++++------ flang/lib/Semantics/canonicalize-omp.cpp | 162 ++++++++++++++++++ flang/lib/Semantics/check-omp-structure.cpp | 19 +- flang/lib/Semantics/check-omp-structure.h | 8 +- flang/test/Parser/OpenMP/error-unparse.f90 | 18 +- flang/test/Parser/OpenMP/nothing.f90 | 100 +++++++++++ flang/test/Semantics/OpenMP/error.f90 | 8 + 11 files changed, 365 insertions(+), 75 deletions(-) create mode 100644 flang/test/Semantics/OpenMP/error.f90 diff --git a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp index 231df63bbae92..c78dd7f14e503 100644 --- a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp +++ b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp @@ -106,10 +106,16 @@ std::string OpenMPCounterVisitor::getName(const OmpWrapperType &w) { return getName(*std::get(w)); } std::string OpenMPCounterVisitor::getName(const OpenMPDeclarativeConstruct &c) { - 
return std::visit( - [&](const auto &o) -> std::string { - const CharBlock &source{std::get(o.t).source}; - return normalize_construct_name(source.ToString()); + return std::visit( // + Fortran::common::visitors{ + [&](const OpenMPUtilityConstruct &o) -> std::string { + const CharBlock &source{o.source}; + return normalize_construct_name(source.ToString()); + }, + [&](const auto &o) -> std::string { + const CharBlock &source{std::get(o.t).source}; + return normalize_construct_name(source.ToString()); + }, }, c.u); } diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 9df7c6d5e39c3..b693e001e5e4b 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -4342,7 +4342,7 @@ struct OpenMPDeclarativeConstruct { std::variant + OpenMPRequiresConstruct, OpenMPUtilityConstruct> u; }; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index fe6d82125a9e0..0a84162291573 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2586,6 +2586,10 @@ static void genOMPDispatch(lower::AbstractConverter &converter, //===----------------------------------------------------------------------===// // OpenMPDeclarativeConstruct visitors //===----------------------------------------------------------------------===// +static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, + const parser::OpenMPUtilityConstruct &); static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 0a0a29002de27..75bb64d06ed0f 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -1090,7 +1090,9 @@ TYPE_PARSER(startOmpLine >> construct( Parser{}) || construct( - Parser{})) / + Parser{}) || + construct( + 
Parser{})) / endOmpLine)) // Block Construct diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 4fe57f3e348d3..58820476c51bc 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2631,81 +2631,64 @@ class UnparseVisitor { } } void Unparse(const OpenMPDeclareReductionConstruct &x) { + BeginOpenMP(); + Word("!$OMP DECLARE REDUCTION "); Put("("); Walk(std::get(x.t)), Put(" : "); Walk(std::get>(x.t), ","), Put(" : "); Walk(std::get(x.t)); Put(")"); Walk(std::get>(x.t)); + EndOpenMP(); } - bool Pre(const OpenMPDeclarativeConstruct &x) { + + void Unparse(const OpenMPDeclareMapperConstruct &z) { BeginOpenMP(); - Word("!$OMP "); - return common::visit( - common::visitors{ - [&](const OpenMPDeclarativeAllocate &z) { - Word("ALLOCATE ("); - Walk(std::get(z.t)); - Put(")"); - Walk(std::get(z.t)); - Put("\n"); - EndOpenMP(); - return false; - }, - [&](const OpenMPDeclareMapperConstruct &z) { - Word("DECLARE MAPPER ("); - const auto &spec{std::get(z.t)}; - if (auto mapname{std::get>(spec.t)}) { - Walk(mapname); - Put(":"); - } - Walk(std::get(spec.t)); - Put("::"); - Walk(std::get(spec.t)); - Put(")"); + Word("!$OMP DECLARE MAPPER ("); + const auto &spec{std::get(z.t)}; + if (auto mapname{std::get>(spec.t)}) { + Walk(mapname); + Put(":"); + } + Walk(std::get(spec.t)); + Put("::"); + Walk(std::get(spec.t)); + Put(")"); - Walk(std::get(z.t)); - Put("\n"); - return false; - }, - [&](const OpenMPDeclareReductionConstruct &) { - Word("DECLARE REDUCTION "); - return true; - }, - [&](const OpenMPDeclareSimdConstruct &y) { - Word("DECLARE SIMD "); - Walk("(", std::get>(y.t), ")"); - Walk(std::get(y.t)); - Put("\n"); - EndOpenMP(); - return false; - }, - [&](const OpenMPDeclareTargetConstruct &) { - Word("DECLARE TARGET "); - return true; - }, - [&](const OpenMPRequiresConstruct &y) { - Word("REQUIRES "); - Walk(std::get(y.t)); - Put("\n"); - EndOpenMP(); - return false; - }, - [&](const OpenMPThreadprivate &) { - 
Word("THREADPRIVATE ("); - return true; - }, - }, - x.u); + Walk(std::get(z.t)); + Put("\n"); + EndOpenMP(); + } + void Unparse(const OpenMPDeclareSimdConstruct &y) { + BeginOpenMP(); + Word("!$OMP DECLARE SIMD "); + Walk("(", std::get>(y.t), ")"); + Walk(std::get(y.t)); + Put("\n"); + EndOpenMP(); } - void Post(const OpenMPDeclarativeConstruct &) { + void Unparse(const OpenMPDeclareTargetConstruct &x) { + BeginOpenMP(); + Word("!$OMP DECLARE TARGET "); + Walk(std::get(x.t)); Put("\n"); EndOpenMP(); } - void Post(const OpenMPThreadprivate &) { + void Unparse(const OpenMPRequiresConstruct &y) { + BeginOpenMP(); + Word("!$OMP REQUIRES "); + Walk(std::get(y.t)); + Put("\n"); + EndOpenMP(); + } + void Unparse(const OpenMPThreadprivate &x) { + BeginOpenMP(); + Word("!$OMP THREADPRIVATE ("); + Walk(std::get(x.t)); Put(")\n"); EndOpenMP(); } + bool Pre(const OmpMessageClause &x) { Walk(x.v); return false; diff --git a/flang/lib/Semantics/canonicalize-omp.cpp b/flang/lib/Semantics/canonicalize-omp.cpp index 0481b3d41f501..5164f1dc6faab 100644 --- a/flang/lib/Semantics/canonicalize-omp.cpp +++ b/flang/lib/Semantics/canonicalize-omp.cpp @@ -50,6 +50,43 @@ class CanonicalizationOfOmp { void Post(parser::ExecutionPart &body) { RewriteOmpAllocations(body); } + // Pre-visit all constructs that have both a specification part and + // an execution part, and store the connection between the two. 
+ bool Pre(parser::BlockConstruct &x) { + auto *spec = &std::get(x.t).v; + auto *block = &std::get(x.t); + blockForSpec_.insert(std::make_pair(spec, block)); + return true; + } + bool Pre(parser::MainProgram &x) { + auto *spec = &std::get(x.t); + auto *block = &std::get(x.t).v; + blockForSpec_.insert(std::make_pair(spec, block)); + return true; + } + bool Pre(parser::FunctionSubprogram &x) { + auto *spec = &std::get(x.t); + auto *block = &std::get(x.t).v; + blockForSpec_.insert(std::make_pair(spec, block)); + return true; + } + bool Pre(parser::SubroutineSubprogram &x) { + auto *spec = &std::get(x.t); + auto *block = &std::get(x.t).v; + blockForSpec_.insert(std::make_pair(spec, block)); + return true; + } + bool Pre(parser::SeparateModuleSubprogram &x) { + auto *spec = &std::get(x.t); + auto *block = &std::get(x.t).v; + blockForSpec_.insert(std::make_pair(spec, block)); + return true; + } + + void Post(parser::SpecificationPart &spec) { + CanonicalizeUtilityConstructs(spec); + } + private: template T *GetConstructIf(parser::ExecutionPartConstruct &x) { if (auto *y{std::get_if(&x.u)}) { @@ -155,6 +192,131 @@ class CanonicalizationOfOmp { } } + // Canonicalization of utility constructs. + // + // This addresses the issue of utility constructs that appear at the + // boundary between the specification and the execution parts, e.g. + // subroutine foo + // integer :: x ! Specification + // !$omp nothing + // x = 1 ! Execution + // ... + // end + // + // Utility constructs (error and nothing) can appear in both the + // specification part and the execution part, except "error at(execution)", + // which cannot be present in the specification part (whereas any utility + // construct can be in the execution part). + // When a utility construct is at the boundary, it should preferably be + // parsed as an element of the execution part, but since the specification + // part is parsed first, the utility construct ends up belonging to the + // specification part. 
+ // + // To allow the likes of the following code to compile, move all utility + // construct that are at the end of the specification part to the beginning + // of the execution part. + // + // subroutine foo + // !$omp error at(execution) ! Initially parsed as declarative construct. + // ! Move it to the execution part. + // end + + void CanonicalizeUtilityConstructs(parser::SpecificationPart &spec) { + auto found = blockForSpec_.find(&spec); + if (found == blockForSpec_.end()) { + // There is no corresponding execution part, so there is nothing to do. + return; + } + parser::Block &block = *found->second; + + // There are two places where an OpenMP declarative construct can + // show up in the tuple in specification part: + // (1) in std::list, or + // (2) in std::list. + // The case (1) is only possible is the list (2) is empty. + + auto &omps = + std::get>(spec.t); + auto &decls = std::get>(spec.t); + + if (!decls.empty()) { + MoveUtilityConstructsFromDecls(decls, block); + } else { + MoveUtilityConstructsFromOmps(omps, block); + } + } + + void MoveUtilityConstructsFromDecls( + std::list &decls, parser::Block &block) { + // Find the trailing range of DeclarationConstructs that are OpenMP + // utility construct, that are to be moved to the execution part. + std::list::reverse_iterator rlast = [&]() { + for (auto rit = decls.rbegin(), rend = decls.rend(); rit != rend; ++rit) { + parser::DeclarationConstruct &dc = *rit; + if (!std::holds_alternative(dc.u)) { + return rit; + } + auto &sc = std::get(dc.u); + using OpenMPDeclarativeConstruct = + common::Indirection; + if (!std::holds_alternative(sc.u)) { + return rit; + } + // Got OpenMPDeclarativeConstruct. If it's not a utility construct + // then stop. 
+ auto &odc = std::get(sc.u).value(); + if (!std::holds_alternative(odc.u)) { + return rit; + } + } + return decls.rend(); + }(); + + std::transform(decls.rbegin(), rlast, std::front_inserter(block), + [](parser::DeclarationConstruct &dc) { + auto &sc = std::get(dc.u); + using OpenMPDeclarativeConstruct = + common::Indirection; + auto &oc = std::get(sc.u).value(); + auto &ut = std::get(oc.u); + + return parser::ExecutionPartConstruct(parser::ExecutableConstruct( + common::Indirection(parser::OpenMPConstruct(std::move(ut))))); + }); + + decls.erase(rlast.base(), decls.end()); + } + + void MoveUtilityConstructsFromOmps( + std::list &omps, + parser::Block &block) { + using OpenMPDeclarativeConstruct = parser::OpenMPDeclarativeConstruct; + // Find the trailing range of OpenMPDeclarativeConstruct that are OpenMP + // utility construct, that are to be moved to the execution part. + std::list::reverse_iterator rlast = [&]() { + for (auto rit = omps.rbegin(), rend = omps.rend(); rit != rend; ++rit) { + OpenMPDeclarativeConstruct &dc = *rit; + if (!std::holds_alternative(dc.u)) { + return rit; + } + } + return omps.rend(); + }(); + + std::transform(omps.rbegin(), rlast, std::front_inserter(block), + [](parser::OpenMPDeclarativeConstruct &dc) { + auto &ut = std::get(dc.u); + return parser::ExecutionPartConstruct(parser::ExecutableConstruct( + common::Indirection(parser::OpenMPConstruct(std::move(ut))))); + }); + + omps.erase(rlast.base(), omps.end()); + } + + // Mapping from the specification parts to the blocks that follow in the + // same construct. This is for converting utility constructs to executable + // constructs. 
+ std::map blockForSpec_; parser::Messages &messages_; }; diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 3a928c8a0289b..4c6a408a9ef30 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -614,6 +614,14 @@ void OmpStructureChecker::Leave(const parser::OpenMPConstruct &) { deferredNonVariables_.clear(); } +void OmpStructureChecker::Enter(const parser::OpenMPDeclarativeConstruct &x) { + EnterDirectiveNest(DeclarativeNest); +} + +void OmpStructureChecker::Leave(const parser::OpenMPDeclarativeConstruct &x) { + ExitDirectiveNest(DeclarativeNest); +} + void OmpStructureChecker::Enter(const parser::OpenMPLoopConstruct &x) { loopStack_.push_back(&x); const auto &beginLoopDir{std::get(x.t)}; @@ -1697,6 +1705,16 @@ void OmpStructureChecker::Leave(const parser::OmpErrorDirective &x) { dirContext_.pop_back(); } +void OmpStructureChecker::Enter(const parser::OmpClause::At &x) { + CheckAllowedClause(llvm::omp::Clause::OMPC_at); + if (GetDirectiveNest(DeclarativeNest) > 0) { + if (x.v.v == parser::OmpAtClause::ActionTime::Execution) { + context_.Say(GetContext().clauseSource, + "The ERROR directive with AT(EXECUTION) cannot appear in the specification part"_err_en_US); + } + } +} + void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) { isPredefinedAllocator = true; const auto &dir{std::get(x.t)}; @@ -2856,7 +2874,6 @@ CHECK_SIMPLE_CLAUSE(Init, OMPC_init) CHECK_SIMPLE_CLAUSE(Use, OMPC_use) CHECK_SIMPLE_CLAUSE(Novariants, OMPC_novariants) CHECK_SIMPLE_CLAUSE(Nocontext, OMPC_nocontext) -CHECK_SIMPLE_CLAUSE(At, OMPC_at) CHECK_SIMPLE_CLAUSE(Severity, OMPC_severity) CHECK_SIMPLE_CLAUSE(Message, OMPC_message) CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter) diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 2a4f6fbd618c3..f47c01c00499a 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ 
b/flang/lib/Semantics/check-omp-structure.h @@ -73,6 +73,9 @@ class OmpStructureChecker void Enter(const parser::OpenMPConstruct &); void Leave(const parser::OpenMPConstruct &); + void Enter(const parser::OpenMPDeclarativeConstruct &); + void Leave(const parser::OpenMPDeclarativeConstruct &); + void Enter(const parser::OpenMPLoopConstruct &); void Leave(const parser::OpenMPLoopConstruct &); void Enter(const parser::OmpEndLoopDirective &); @@ -270,11 +273,12 @@ class OmpStructureChecker const parser::Variable &, const parser::Expr &); inline void ErrIfNonScalarAssignmentStmt( const parser::Variable &, const parser::Expr &); - enum directiveNestType { + enum directiveNestType : int { SIMDNest, TargetBlockOnlyTeams, TargetNest, - LastType + DeclarativeNest, + LastType = DeclarativeNest, }; int directiveNest_[LastType + 1] = {0}; diff --git a/flang/test/Parser/OpenMP/error-unparse.f90 b/flang/test/Parser/OpenMP/error-unparse.f90 index 4dd06b736da80..2cb4e1a083a6c 100644 --- a/flang/test/Parser/OpenMP/error-unparse.f90 +++ b/flang/test/Parser/OpenMP/error-unparse.f90 @@ -1,23 +1,27 @@ -! RUN: %flang_fc1 -fopenmp-version=51 -fopenmp -fdebug-unparse-no-sema %s 2>&1 | FileCheck %s -! RUN: %flang_fc1 -fopenmp-version=51 -fopenmp -fdebug-dump-parse-tree-no-sema %s 2>&1 | FileCheck %s --check-prefix="PARSE-TREE" +! RUN: %flang_fc1 -fopenmp-version=51 -fopenmp -fdebug-unparse %s 2>&1 | FileCheck %s +! 
RUN: %flang_fc1 -fopenmp-version=51 -fopenmp -fdebug-dump-parse-tree %s 2>&1 | FileCheck %s --check-prefix="PARSE-TREE" program main character(*), parameter :: message = "This is an error" !CHECK: !$OMP ERROR AT(COMPILATION) SEVERITY(WARNING) MESSAGE("some message here") !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective !PARSE-TREE: OmpClauseList -> OmpClause -> At -> OmpAtClause -> ActionTime = Compilation !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> Severity = Warning - !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr -> LiteralConstant -> CharLiteralConstant + !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr = '"some message here"' + !PARSE-TREE: LiteralConstant -> CharLiteralConstant + !PARSE-TREE: string = 'some message here' !$omp error at(compilation) severity(warning) message("some message here") - !CHECK: !$OMP ERROR AT(COMPILATION) SEVERITY(FATAL) MESSAGE(message) + !CHECK: !$OMP ERROR AT(COMPILATION) SEVERITY(FATAL) MESSAGE("This is an error") !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective !PARSE-TREE: OmpClauseList -> OmpClause -> At -> OmpAtClause -> ActionTime = Compilation !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> Severity = Fatal - !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr -> Designator -> DataRef -> Name = 'message' + !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr = '"This is an error"' + !PARSE-TREE: Designator -> DataRef -> Name = 'message' !$omp error at(compilation) severity(fatal) message(message) - !CHECK: !$OMP ERROR AT(EXECUTION) SEVERITY(FATAL) MESSAGE(message) + !CHECK: !$OMP ERROR AT(EXECUTION) SEVERITY(FATAL) MESSAGE("This is an error") !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective !PARSE-TREE: OmpClauseList -> 
OmpClause -> At -> OmpAtClause -> ActionTime = Execution !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> Severity = Fatal - !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr -> Designator -> DataRef -> Name = 'message' + !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr = '"This is an error"' + !PARSE-TREE: Designator -> DataRef -> Name = 'message' !$omp error at(EXECUTION) severity(fatal) message(message) end program main diff --git a/flang/test/Parser/OpenMP/nothing.f90 b/flang/test/Parser/OpenMP/nothing.f90 index 80c0932087610..22558c493c444 100644 --- a/flang/test/Parser/OpenMP/nothing.f90 +++ b/flang/test/Parser/OpenMP/nothing.f90 @@ -11,3 +11,103 @@ subroutine f00 !PARSE-TREE: ExecutionPart -> Block !PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective + +subroutine f01 + block + import, none + integer :: x + !$omp nothing ! "nothing" in the execution part + x = x+1 + end block +end + +!UNPARSE: SUBROUTINE f01 +!UNPARSE: BLOCK +!UNPARSE: IMPORT, NONE +!UNPARSE: INTEGER x +!UNPARSE: !$OMP NOTHING +!UNPARSE: x=x+1_4 +!UNPARSE: END BLOCK +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: BlockStmt -> +!PARSE-TREE: BlockSpecificationPart -> SpecificationPart +!PARSE-TREE: | ImportStmt +!PARSE-TREE: | ImplicitPart -> +!PARSE-TREE: | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt +!PARSE-TREE: | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> +!PARSE-TREE: | | EntityDecl +!PARSE-TREE: | | | Name = 'x' +!PARSE-TREE: Block +!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective +!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=x+1_4' +!PARSE-TREE: | | Variable = 'x' +!PARSE-TREE: | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | Expr = 'x+1_4' +!PARSE-TREE: | | | Add +!PARSE-TREE: | | | | Expr = 
'x' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | | Expr = '1_4' +!PARSE-TREE: | | | | | LiteralConstant -> IntLiteralConstant = '1' +!PARSE-TREE: EndBlockStmt -> + +subroutine f02 + integer :: x + !$omp nothing +end + +!UNPARSE: SUBROUTINE f02 +!UNPARSE: INTEGER x +!UNPARSE: !$OMP NOTHING +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: SpecificationPart +!PARSE-TREE: | ImplicitPart -> +!PARSE-TREE: | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt +!PARSE-TREE: | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> +!PARSE-TREE: | | EntityDecl +!PARSE-TREE: | | | Name = 'x' +!PARSE-TREE: ExecutionPart -> Block +!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective + +subroutine f03 + block + !$omp nothing ! "nothing" in the specification part + import, none + integer :: x + x = x+1 + end block +end + +!UNPARSE: SUBROUTINE f03 +!UNPARSE: BLOCK +!UNPARSE: !$OMP NOTHING +!UNPARSE: IMPORT, NONE +!UNPARSE: INTEGER x +!UNPARSE: x=x+1_4 +!UNPARSE: END BLOCK +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: ExecutionPart -> Block +!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> BlockConstruct +!PARSE-TREE: | | BlockStmt -> +!PARSE-TREE: | | BlockSpecificationPart -> SpecificationPart +!PARSE-TREE: | | | OpenMPDeclarativeConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective +!PARSE-TREE: | | | ImportStmt +!PARSE-TREE: | | | ImplicitPart -> +!PARSE-TREE: | | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt +!PARSE-TREE: | | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> +!PARSE-TREE: | | | | EntityDecl +!PARSE-TREE: | | | | | Name = 'x' +!PARSE-TREE: | | Block +!PARSE-TREE: | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=x+1_4' +!PARSE-TREE: | | | | Variable = 'x' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | | 
Expr = 'x+1_4' +!PARSE-TREE: | | | | | Add +!PARSE-TREE: | | | | | | Expr = 'x' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | | | | Expr = '1_4' +!PARSE-TREE: | | | | | | | LiteralConstant -> IntLiteralConstant = '1' +!PARSE-TREE: | | EndBlockStmt -> +!PARSE-TREE: EndSubroutineStmt -> \ No newline at end of file diff --git a/flang/test/Semantics/OpenMP/error.f90 b/flang/test/Semantics/OpenMP/error.f90 new file mode 100644 index 0000000000000..067417a8cda3b --- /dev/null +++ b/flang/test/Semantics/OpenMP/error.f90 @@ -0,0 +1,8 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=51 + +subroutine f00(x) +!ERROR: The ERROR directive with AT(EXECUTION) cannot appear in the specification part + !$omp error at(execution) message("Haaa!") + integer :: x +end + From faa30be101e9ae2bdb58d2acb250341f1b13031c Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 3 Jan 2025 16:35:02 +0100 Subject: [PATCH 087/480] [mlir][Transforms] Fix build after #116524 (#121578) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix build errors after #116524. ``` error: call of overloaded ‘TypeRange(ValueVector&)’ is ambiguous ``` --- mlir/lib/Transforms/Utils/DialectConversion.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 0c5520988eff3..6c3863e4c7f66 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -227,7 +227,7 @@ ConversionValueMapping::lookupOrDefault(Value from, ValueVector current{from}; do { // Store the current value if the types match. - if (TypeRange(current) == desiredTypes) + if (TypeRange(ValueRange(current)) == desiredTypes) desiredValue = current; // If possible, Replace each value with (one or multiple) mapped values. 
@@ -271,9 +271,8 @@ ConversionValueMapping::lookupOrDefault(Value from, ValueVector ConversionValueMapping::lookupOrNull(Value from, TypeRange desiredTypes) const { ValueVector result = lookupOrDefault(from, desiredTypes); - TypeRange resultTypes(result); if (result == ValueVector{from} || - (!desiredTypes.empty() && resultTypes != desiredTypes)) + (!desiredTypes.empty() && TypeRange(ValueRange(result)) != desiredTypes)) return {}; return result; } @@ -1291,7 +1290,7 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( } ValueVector repl = mapping.lookupOrDefault(operand, legalTypes); - if (!repl.empty() && TypeRange(repl) == legalTypes) { + if (!repl.empty() && TypeRange(ValueRange(repl)) == legalTypes) { // Mapped values have the correct type or there is an existing // materialization. Or the operand is not mapped at all and has the // correct type. From 5137c209f0c19668d06e48cc4293e4c01a77c964 Mon Sep 17 00:00:00 2001 From: agozillon Date: Fri, 3 Jan 2025 16:46:15 +0100 Subject: [PATCH 088/480] [Flang][OpenMP] Fix allocating arrays with size intrinisic (#119226) Attempt to address the following example from causing an assert or ICE: ``` subroutine test(a) implicit none integer :: i real(kind=real64), dimension(:) :: a real(kind=real64), dimension(size(a, 1)) :: b !$omp target map(tofrom: b) do i = 1, 10 b(i) = i end do !$omp end target end subroutine ``` Where we utilise a Fortran intrinsic (size) to calculate the size of allocatable arrays and then map it to device. 
--- flang/lib/Lower/OpenMP/OpenMP.cpp | 54 ++++++++++++++++--- .../Optimizer/OpenMP/MapInfoFinalization.cpp | 14 +++-- .../Lower/OpenMP/allocatable-array-bounds.f90 | 7 +-- flang/test/Lower/OpenMP/array-bounds.f90 | 2 +- .../OpenMP/derived-type-allocatable-map.f90 | 8 +-- .../local-intrinsic-sized-array-map.f90 | 32 +++++++++++ .../Transforms/omp-map-info-finalization.fir | 14 ++--- ...target-map-local-intrinisc-sized-param.f90 | 39 ++++++++++++++ 8 files changed, 143 insertions(+), 27 deletions(-) create mode 100644 flang/test/Lower/OpenMP/local-intrinsic-sized-array-map.f90 create mode 100644 offload/test/offloading/fortran/target-map-local-intrinisc-sized-param.f90 diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 0a84162291573..cd4b25a17722c 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -923,13 +923,24 @@ static void genBodyOfTargetOp( while (!valuesDefinedAbove.empty()) { for (mlir::Value val : valuesDefinedAbove) { mlir::Operation *valOp = val.getDefiningOp(); - if (mlir::isMemoryEffectFree(valOp)) { + assert(valOp != nullptr); + + // NOTE: We skip BoxDimsOp's as the lesser of two evils is to map the + // indices separately, as the alternative is to eventually map the Box, + // which comes with a fairly large overhead comparatively. We could be + // more robust about this and check using a BackwardsSlice to see if we + // run the risk of mapping a box. 
+ if (mlir::isMemoryEffectFree(valOp) && + !mlir::isa(valOp)) { mlir::Operation *clonedOp = valOp->clone(); entryBlock->push_front(clonedOp); - val.replaceUsesWithIf(clonedOp->getResult(0), - [entryBlock](mlir::OpOperand &use) { - return use.getOwner()->getBlock() == entryBlock; - }); + + auto replace = [entryBlock](mlir::OpOperand &use) { + return use.getOwner()->getBlock() == entryBlock; + }; + + valOp->getResults().replaceUsesWithIf(clonedOp->getResults(), replace); + valOp->replaceUsesWithIf(clonedOp, replace); } else { auto savedIP = firOpBuilder.getInsertionPoint(); firOpBuilder.setInsertionPointAfter(valOp); @@ -937,9 +948,36 @@ static void genBodyOfTargetOp( firOpBuilder.createTemporary(val.getLoc(), val.getType()); firOpBuilder.createStoreWithConvert(copyVal.getLoc(), val, copyVal); - llvm::SmallVector bounds; + lower::AddrAndBoundsInfo info = lower::getDataOperandBaseAddr( + firOpBuilder, val, /*isOptional=*/false, val.getLoc()); + llvm::SmallVector bounds = + Fortran::lower::genImplicitBoundsOps( + firOpBuilder, info, + hlfir::translateToExtendedValue(val.getLoc(), firOpBuilder, + hlfir::Entity{val}) + .first, + /*dataExvIsAssumedSize=*/false, val.getLoc()); + std::stringstream name; firOpBuilder.setInsertionPoint(targetOp); + + llvm::omp::OpenMPOffloadMappingFlags mapFlag = + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; + mlir::omp::VariableCaptureKind captureKind = + mlir::omp::VariableCaptureKind::ByRef; + + mlir::Type eleType = copyVal.getType(); + if (auto refType = + mlir::dyn_cast(copyVal.getType())) + eleType = refType.getElementType(); + + if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) { + captureKind = mlir::omp::VariableCaptureKind::ByCopy; + } else if (!fir::isa_builtin_cptr_type(eleType)) { + mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; + } + mlir::Value mapOp = createMapInfoOp( firOpBuilder, copyVal.getLoc(), copyVal, /*varPtrPtr=*/mlir::Value{}, name.str(), bounds, @@ -947,8 +985,8 @@ static void 
genBodyOfTargetOp( /*membersIndex=*/mlir::ArrayAttr{}, static_cast< std::underlying_type_t>( - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT), - mlir::omp::VariableCaptureKind::ByCopy, copyVal.getType()); + mapFlag), + captureKind, copyVal.getType()); // Get the index of the first non-map argument before modifying mapVars, // then append an element to mapVars and an associated entry block diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index ad7b806ae262a..e823443958714 100644 --- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -162,13 +162,19 @@ class MapInfoFinalizationPass mlir::Value baseAddrAddr = builder.create( loc, descriptor, fir::BoxFieldAttr::base_addr); + mlir::Type underlyingVarType = + llvm::cast( + fir::unwrapRefType(baseAddrAddr.getType())) + .getElementType(); + if (auto seqType = llvm::dyn_cast(underlyingVarType)) + if (seqType.hasDynamicExtents()) + underlyingVarType = seqType.getEleTy(); + // Member of the descriptor pointing at the allocated data return builder.create( loc, baseAddrAddr.getType(), descriptor, - mlir::TypeAttr::get(llvm::cast( - fir::unwrapRefType(baseAddrAddr.getType())) - .getElementType()), - baseAddrAddr, /*members=*/mlir::SmallVector{}, + mlir::TypeAttr::get(underlyingVarType), baseAddrAddr, + /*members=*/mlir::SmallVector{}, /*membersIndex=*/mlir::ArrayAttr{}, bounds, builder.getIntegerAttr(builder.getIntegerType(64, false), mapType), builder.getAttr( diff --git a/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 b/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 index e162c5a2d6d69..e66b6f17d8858 100644 --- a/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 +++ b/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 @@ -23,7 +23,7 @@ !HOST: %[[BOX_3:.*]]:3 = fir.box_dims %[[LOAD_3]], %[[CONSTANT_3]] : (!fir.box>>, index) -> (index, index, index) !HOST: %[[BOUNDS_1:.*]] = 
omp.map.bounds lower_bound(%[[LB_1]] : index) upper_bound(%[[UB_1]] : index) extent(%[[BOX_3]]#1 : index) stride(%[[BOX_2]]#2 : index) start_idx(%[[BOX_1]]#0 : index) {stride_in_bytes = true} !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %[[DECLARE_1]]#1 base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE_1]]#1 : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_1]]) -> !fir.llvm_ptr>> {name = ""} +!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE_1]]#1 : !fir.ref>>>, i32) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_1]]) -> !fir.llvm_ptr>> {name = ""} !HOST: %[[MAP_INFO_1:.*]] = omp.map.info var_ptr(%[[DECLARE_1]]#1 : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr>>) -> !fir.ref>>> {name = "sp_read(2:5)"} !HOST: %[[LOAD_3:.*]] = fir.load %[[DECLARE_2]]#0 : !fir.ref>>> @@ -41,7 +41,7 @@ !HOST: %[[BOX_5:.*]]:3 = fir.box_dims %[[LOAD_5]], %[[CONSTANT_5]] : (!fir.box>>, index) -> (index, index, index) !HOST: %[[BOUNDS_2:.*]] = omp.map.bounds lower_bound(%[[LB_2]] : index) upper_bound(%[[UB_2]] : index) extent(%[[BOX_5]]#1 : index) stride(%[[BOX_4]]#2 : index) start_idx(%[[BOX_3]]#0 : index) {stride_in_bytes = true} !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %[[DECLARE_2]]#1 base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE_2]]#1 : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.llvm_ptr>> {name = ""} +!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE_2]]#1 : !fir.ref>>>, i32) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.llvm_ptr>> {name = ""} !HOST: %[[MAP_INFO_2:.*]] = 
omp.map.info var_ptr(%[[DECLARE_2]]#1 : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr>>) -> !fir.ref>>> {name = "sp_write(2:5)"} subroutine read_write_section() @@ -80,8 +80,9 @@ module assumed_allocatable_array_routines !HOST: %[[BOX_3:.*]]:3 = fir.box_dims %[[LOAD_3]], %[[CONSTANT_3]] : (!fir.box>>, index) -> (index, index, index) !HOST: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%[[LB]] : index) upper_bound(%[[UB]] : index) extent(%[[BOX_3]]#1 : index) stride(%[[BOX_2]]#2 : index) start_idx(%[[BOX_1]]#0 : index) {stride_in_bytes = true} !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %[[DECLARE]]#1 base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {name = ""} +!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref>>>, i32) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {name = ""} !HOST: %[[MAP_INFO:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr>>) -> !fir.ref>>> {name = "arr_read_write(2:5)"} + subroutine assumed_shape_array(arr_read_write) integer, allocatable, intent(inout) :: arr_read_write(:) diff --git a/flang/test/Lower/OpenMP/array-bounds.f90 b/flang/test/Lower/OpenMP/array-bounds.f90 index 78fa81567ca54..479b6887a83f4 100644 --- a/flang/test/Lower/OpenMP/array-bounds.f90 +++ b/flang/test/Lower/OpenMP/array-bounds.f90 @@ -51,7 +51,7 @@ module assumed_array_routines !HOST: %[[DIMS1:.*]]:3 = fir.box_dims %[[ARG0_DECL]]#1, %[[C0_1]] : (!fir.box>, index) -> (index, index, index) !HOST: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%[[C3]] : index) upper_bound(%[[C4]] : index) 
extent(%[[DIMS1]]#1 : index) stride(%[[DIMS0]]#2 : index) start_idx(%[[C0]] : index) {stride_in_bytes = true} !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %0 base_addr : (!fir.ref>>) -> !fir.llvm_ptr>> -!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref>>, !fir.array) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {name = ""} +!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref>>, i32) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {name = ""} !HOST: %[[MAP:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref>>, !fir.box>) map_clauses(to) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr>>) -> !fir.ref> {name = "arr_read_write(2:5)"} !HOST: omp.target map_entries(%[[MAP]] -> %{{.*}}, {{.*}} -> {{.*}}, %[[MAP_INFO_MEMBER]] -> %{{.*}} : !fir.ref>, !fir.ref, !fir.llvm_ptr>>) { subroutine assumed_shape_array(arr_read_write) diff --git a/flang/test/Lower/OpenMP/derived-type-allocatable-map.f90 b/flang/test/Lower/OpenMP/derived-type-allocatable-map.f90 index 47bcf2a7229ea..28a2b9b5b967b 100644 --- a/flang/test/Lower/OpenMP/derived-type-allocatable-map.f90 +++ b/flang/test/Lower/OpenMP/derived-type-allocatable-map.f90 @@ -6,7 +6,7 @@ !CHECK: %[[MEMBER_INDEX:.*]] = arith.constant 4 : index !CHECK: %[[MEMBER_COORD:.*]] = fir.coordinate_of %[[DECLARE]]#0, %[[MEMBER_INDEX]] : (!fir.ref>, index) -> !fir.ref>>> !CHECK: %[[MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[MEMBER_COORD]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -!CHECK: %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} +!CHECK: %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info 
var_ptr(%[[MEMBER_COORD]] : !fir.ref>>>, i32) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} !CHECK: %[[MAP_MEMBER_DESCRIPTOR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} !CHECK: %[[MAP_PARENT:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref>, !fir.type<[[ONE_LAYER_TY]]>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBER_DESCRIPTOR]], %[[MAP_MEMBER_BASE_ADDR]] : [4], [4, 0] : !fir.ref>>>, !fir.llvm_ptr>>) -> !fir.ref> {{{.*}} partial_map = true} !CHECK: omp.target map_entries(%[[MAP_PARENT]] -> %[[ARG0:.*]], %[[MAP_MEMBER_DESCRIPTOR]] -> %[[ARG1:.*]], %[[MAP_MEMBER_BASE_ADDR]] -> %[[ARG2:.*]] : !fir.ref>, !fir.ref>>>, !fir.llvm_ptr>>) { @@ -37,7 +37,7 @@ subroutine dtype_alloca_map_op_block() !CHECK: %[[MEMBER_INDEX:.*]] = arith.constant 4 : index !CHECK: %[[MEMBER_COORD:.*]] = fir.coordinate_of %[[LOAD_DTYPE]], %[[MEMBER_INDEX]] : (!fir.box>>, index) -> !fir.ref>>> !CHECK: %[[MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[MEMBER_COORD]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -!CHECK: %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} +!CHECK: %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref>>>, i32) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} !CHECK: %[[MAP_MEMBER_DESC:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} !CHECK: %[[LOAD_DTYPE:.*]] = fir.load %[[DECLARE]]#0 : !fir.ref>>> !CHECK: %[[MEMBER_COORD:.*]] = arith.constant 5 : index @@ -78,7 +78,7 @@ subroutine alloca_dtype_op_block_add() !CHECK: 
%[[NESTED_MEMBER_INDEX:.*]] = arith.constant 2 : index !CHECK: %[[NESTED_MEMBER_COORD:.*]] = fir.coordinate_of %[[NESTED_DTYPE_COORD]], %[[NESTED_MEMBER_INDEX]] : (!fir.ref>, index) -> !fir.ref>>> !CHECK: %[[NESTED_MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[NESTED_MEMBER_COORD]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -!CHECK: %[[MAP_NESTED_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[NESTED_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} +!CHECK: %[[MAP_NESTED_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref>>>, i32) var_ptr_ptr(%[[NESTED_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} !CHECK: %[[MAP_NESTED_MEMBER_COORD:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} !CHECK: %[[LOAD:.*]] = fir.load %[[DECLARE]]#0 : !fir.ref}>>>> !CHECK: %[[NESTED_DTYPE_INDEX:.*]] = arith.constant 6 : index @@ -128,7 +128,7 @@ subroutine alloca_nest_dype_map_op_block_add() !CHECK: %[[NESTED_MEMBER_INDEX:.*]] = arith.constant 2 : index !CHECK: %[[NESTED_MEMBER_COORD:.*]] = fir.coordinate_of %[[NESTED_DTYPE_COORD]], %[[NESTED_MEMBER_INDEX]] : (!fir.ref>, index) -> !fir.ref>>> !CHECK: %[[NESTED_MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[NESTED_MEMBER_COORD]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -!CHECK: %[[MAP_NESTED_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[NESTED_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} +!CHECK: %[[MAP_NESTED_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref>>>, i32) var_ptr_ptr(%[[NESTED_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) 
bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} !CHECK: %[[MAP_NESTED_MEMBER_DESC:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} !CHECK: %[[MAP_PARENT:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref>, !fir.type<[[REC_TY]]>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_NESTED_MEMBER_DESC]], %[[MAP_NESTED_MEMBER_BASE_ADDR]] : [6, 2], [6, 2, 0] : !fir.ref>>>, !fir.llvm_ptr>>) -> !fir.ref> {{.*}} !CHECK: omp.target map_entries(%[[MAP_PARENT]] -> %[[ARG0:.*]], %[[MAP_NESTED_MEMBER_DESC]] -> %[[ARG1:.*]], %[[MAP_NESTED_MEMBER_BASE_ADDR]] -> %[[ARG2:.*]] : !fir.ref>, !fir.ref>>>, !fir.llvm_ptr>>) { diff --git a/flang/test/Lower/OpenMP/local-intrinsic-sized-array-map.f90 b/flang/test/Lower/OpenMP/local-intrinsic-sized-array-map.f90 new file mode 100644 index 0000000000000..ab2cdf380b783 --- /dev/null +++ b/flang/test/Lower/OpenMP/local-intrinsic-sized-array-map.f90 @@ -0,0 +1,32 @@ +!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes="HLFIRDIALECT" + +!HLFIRDIALECT: func.func @_QPlocal_variable_intrinsic_size(%[[ARG0:.*]]: !fir.box> {fir.bindc_name = "a"}) { +!HLFIRDIALECT: %[[SZ_DATA:.*]] = fir.alloca index +!HLFIRDIALECT: %[[DECLARE:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope {{.*}} {uniq_name = "_QFlocal_variable_intrinsic_sizeEa"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) +!HLFIRDIALECT: %[[DIMENSIONS:.*]]:3 = fir.box_dims %[[DECLARE]]#0, %{{.*}} : (!fir.box>, index) -> (index, index, index) +!HLFIRDIALECT: fir.store %[[DIMENSIONS]]#1 to %[[SZ_DATA]] : !fir.ref +!HLFIRDIALECT: %[[SIZE_SEL:.*]] = arith.select {{.*}}, {{.*}}, {{.*}} : index +!HLFIRDIALECT: %[[B_ALLOCA:.*]] = fir.alloca !fir.array, %[[SIZE_SEL]] {bindc_name = "b", uniq_name = "_QFlocal_variable_intrinsic_sizeEb"} +!HLFIRDIALECT: %[[B_SHAPE:.*]] = fir.shape %[[SIZE_SEL]] : (index) -> !fir.shape<1> +!HLFIRDIALECT: %[[B_DECLARE:.*]]:2 = hlfir.declare %[[B_ALLOCA]](%[[B_SHAPE]]) 
{uniq_name = "_QFlocal_variable_intrinsic_sizeEb"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) +!HLFIRDIALECT: %[[BOUNDS:.*]] = omp.map.bounds lower_bound({{.*}} : index) upper_bound({{.*}} : index) extent({{.*}} : index) stride({{.*}} : index) start_idx({{.*}} : index) {stride_in_bytes = true} +!HLFIRDIALECT: %[[MAP_DATA_B:.*]] = omp.map.info var_ptr(%[[B_DECLARE]]#1 : !fir.ref>, f32) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "b"} +!HLFIRDIALECT: %[[MAP_DATA_SZ:.*]] = omp.map.info var_ptr(%[[SZ_DATA]] : !fir.ref, index) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = ""} +!HLFIRDIALECT: omp.target map_entries(%[[MAP_DATA_B]] -> %[[ARG1:.*]], %[[MAP_DATA_SZ]] -> %[[ARG2:.*]] : !fir.ref>, !fir.ref) { +!HLFIRDIALECT: %[[SZ_LD:.*]] = fir.load %[[ARG2]] : !fir.ref +!HLFIRDIALECT: %[[SZ_CONV:.*]] = fir.convert %[[SZ_LD]] : (index) -> i64 +!HLFIRDIALECT: %[[SZ_CONV2:.*]] = fir.convert %[[SZ_CONV]] : (i64) -> index +!HLFIRDIALECT: %[[SEL_SZ:.*]] = arith.cmpi sgt, %[[SZ_CONV2]], %{{.*}} : index +!HLFIRDIALECT: %[[SEL_SZ2:.*]] = arith.select %[[SEL_SZ]], %[[SZ_CONV2]], %{{.*}} : index +!HLFIRDIALECT: %[[SHAPE:.*]] = fir.shape %[[SEL_SZ2]] : (index) -> !fir.shape<1> +!HLFIRDIALECT: %{{.*}} = hlfir.declare %[[ARG1]](%[[SHAPE]]) {uniq_name = "_QFlocal_variable_intrinsic_sizeEb"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) + +subroutine local_variable_intrinsic_size(a) + implicit none + real, dimension(:) :: a + real, dimension(size(a, 1)) :: b + +!$omp target map(tofrom: b) + b(5) = 5 +!$omp end target +end subroutine diff --git a/flang/test/Transforms/omp-map-info-finalization.fir b/flang/test/Transforms/omp-map-info-finalization.fir index 74b87152d5b05..19e6dcad068cd 100644 --- a/flang/test/Transforms/omp-map-info-finalization.fir +++ b/flang/test/Transforms/omp-map-info-finalization.fir @@ -35,7 +35,7 @@ func.func @test_descriptor_expansion_pass(%arg0: !fir.box>) { // CHECK: 
%[[DESC_PARENT_MAP:.*]] = omp.map.info var_ptr(%[[DECLARE2]]#1 : !fir.ref>>, !fir.box>) map_clauses(to) capture(ByRef) members(%[[DESC_MEMBER_MAP]] : [0] : !fir.llvm_ptr>) -> !fir.ref>> // CHECK: fir.store %[[DECLARE1]]#1 to %[[ALLOCA]] : !fir.ref>> // CHECK: %[[BASE_ADDR_OFF_2:.*]] = fir.box_offset %[[ALLOCA]] base_addr : (!fir.ref>>) -> !fir.llvm_ptr>> -// CHECK: %[[DESC_MEMBER_MAP_2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref>>, !fir.array) var_ptr_ptr(%[[BASE_ADDR_OFF_2]] : !fir.llvm_ptr>>) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {name = ""} +// CHECK: %[[DESC_MEMBER_MAP_2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref>>, i32) var_ptr_ptr(%[[BASE_ADDR_OFF_2]] : !fir.llvm_ptr>>) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {name = ""} // CHECK: %[[DESC_PARENT_MAP_2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref>>, !fir.box>) map_clauses(to) capture(ByRef) members(%[[DESC_MEMBER_MAP_2]] : [0] : !fir.llvm_ptr>>) -> !fir.ref> // CHECK: omp.target map_entries(%[[DESC_PARENT_MAP]] -> %[[ARG1:.*]], %[[DESC_PARENT_MAP_2]] -> %[[ARG2:.*]], %[[DESC_MEMBER_MAP]] -> %[[ARG3:.*]], %[[DESC_MEMBER_MAP_2]] -> %[[ARG4:.*]] : {{.*}}) { @@ -115,7 +115,7 @@ func.func @dtype_alloca_op_block_add(%arg0: !fir.ref>, index) -> !fir.ref>>> // CHECK: %[[MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[MEMBER_COORD:.*]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -// CHECK: %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} +// CHECK: %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref>>>, i32) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} // CHECK: %[[MAP_MEMBER_DESCRIPTOR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : 
!fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {name = "one_l%array_j"} // CHECK: %[[MAP_MEMBER_PARENT:.*]] = omp.map.info var_ptr(%[[ALLOCA]]#0 : !fir.ref<[[REC_TY]]>>, [[REC_TY]]>) map_clauses(tofrom) capture(ByRef) members(%10, %9 : [4], [4, 0] : !fir.ref>>>, !fir.llvm_ptr>>) -> !fir.ref<[[REC_TY]]>> {{.*}} // CHECK: omp.target map_entries(%[[MAP_MEMBER_PARENT]] -> %[[ARG1:.*]], %[[MAP_MEMBER_DESCRIPTOR]] -> %[[ARG2:.*]], %[[MAP_MEMBER_BASE_ADDR]] -> %[[ARG3:.*]] : !fir.ref<[[REC_TY]]>>, !fir.ref>>>, !fir.llvm_ptr>>) { @@ -157,7 +157,7 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : !fir.ref>>> // CHECK: %[[ALLOCATABLE_MEMBER_COORD:.*]] = fir.coordinate_of %[[LOAD_ALLOCA]], %{{.*}} : (!fir.box>>, index) -> !fir.ref>>> // CHECK: %[[ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[ALLOCATABLE_MEMBER_COORD]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -// CHECK: %[[MAP_ALLOCA_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER_COORD]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[ALLOCATABLE_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} +// CHECK: %[[MAP_ALLOCA_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER_COORD]] : !fir.ref>>>, i32) var_ptr_ptr(%[[ALLOCATABLE_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} // CHECK: %[[MAP_ALLOCA_MEMBER_DESCRIPTOR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER_COORD]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} // CHECK: %[[LOAD_ALLOCA2:.*]] = fir.load %[[ALLOCA]]#0 : !fir.ref>>> // CHECK: %[[REGULAR_MEMBER_COORD:.*]] = fir.coordinate_of %[[LOAD_ALLOCA2]], %{{.*}} : (!fir.box>>, index) -> !fir.ref @@ -208,7 +208,7 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : !fir.ref>>, index) -> !fir.ref,array_k:!fir.box>>,k:i32}]]>> // CHECK: %[[NESTED_ALLOCA_MEMBER:.*]] = fir.coordinate_of 
%[[INTERMEDIATE_DTYPE_NESTED_MEMBER]], %{{.*}} : (!fir.ref>, index) -> !fir.ref>>> // CHECK: %[[NESTED_ALLOCA_MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[NESTED_ALLOCA_MEMBER]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -// CHECK: %[[MAP_NESTED_ALLOCA_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_ALLOCA_MEMBER]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[NESTED_ALLOCA_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} +// CHECK: %[[MAP_NESTED_ALLOCA_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_ALLOCA_MEMBER]] : !fir.ref>>>, i32) var_ptr_ptr(%[[NESTED_ALLOCA_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} // CHECK: %[[MAP_NESTED_ALLOCA_MEMBER:.*]] = omp.map.info var_ptr(%[[NESTED_ALLOCA_MEMBER]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} // CHECK: %[[ALLOCA_LOAD2:.*]] = fir.load %[[ALLOCA]]#0 : !fir.ref>>> // CHECK: %[[INTERMEDIATE_DTYPE_NESTED_MEMBER2:.*]] = fir.coordinate_of %[[ALLOCA_LOAD2]], %{{.*}} : (!fir.box>>, index) -> !fir.ref> @@ -252,7 +252,7 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : !fir.ref>, index) -> !fir.ref,array_k:!fir.box>>,k:i32}]]>> // CHECK: %[[ALLOCATABLE_MEMBER:.*]] = fir.coordinate_of %[[NESTED_DTYPE_COORD]], %{{.*}} : (!fir.ref>, index) -> !fir.ref>>> // CHECK: %[[ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[ALLOCATABLE_MEMBER]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -// CHECK: %[[MAP_ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[ALLOCATABLE_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} +// CHECK: %[[MAP_ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER]] : !fir.ref>>>, i32) var_ptr_ptr(%[[ALLOCATABLE_MEMBER_BASE_ADDR]] : !fir.llvm_ptr>>) 
map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {{.*}} // CHECK: %[[MAP_ALLOCATABLE_MEMBER_DESCRIPTOR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} // CHECK: %[[MAP_PARENT:.*]] = omp.map.info var_ptr(%[[ALLOCA]]#0 : !fir.ref>, !fir.type<[[REC_TY]]>) map_clauses(tofrom) capture(ByRef) members(%12, %11 : [6, 2], [6, 2, 0] : !fir.ref>>>, !fir.llvm_ptr>>) -> !fir.ref> {{.*}} // CHECK: omp.target map_entries(%[[MAP_PARENT]] -> %[[ARG1:.*]], %[[MAP_ALLOCATABLE_MEMBER_DESCRIPTOR]] -> %[[ARG2:.*]], %[[MAP_ALLOCATABLE_MEMBER_BASE_ADDR]] -> %[[ARG3:.*]] : !fir.ref>, !fir.ref>>>, !fir.llvm_ptr>>) { @@ -286,13 +286,13 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : !fir.ref>) -> (!fir.ref>, !fir.ref>) // CHECK: %[[DESC_1:.*]] = fir.coordinate_of %[[DECLARE]]#0, %{{.*}} : (!fir.ref>, index) -> !fir.ref>>,vertexy:!fir.box>>}]]>>>>> // CHECK: %[[BASE_ADDR_1:.*]] = fir.box_offset %[[DESC_1]] base_addr : (!fir.ref>>>>) -> !fir.llvm_ptr>>> -// CHECK: %[[BASE_ADDR_MAP_1:.*]] = omp.map.info var_ptr(%[[DESC_1]] : !fir.ref>>>>, !fir.array>) var_ptr_ptr(%[[BASE_ADDR_1]] : !fir.llvm_ptr>>>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%{{.*}}) -> !fir.llvm_ptr>>> {{.*}} +// CHECK: %[[BASE_ADDR_MAP_1:.*]] = omp.map.info var_ptr(%[[DESC_1]] : !fir.ref>>>>, !fir.type<[[REC_TY2]]>) var_ptr_ptr(%[[BASE_ADDR_1]] : !fir.llvm_ptr>>>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%{{.*}}) -> !fir.llvm_ptr>>> {{.*}} // CHECK: %[[DESC_MAP_1:.*]] = omp.map.info var_ptr(%[[DESC_1]] : !fir.ref>>>>, !fir.box>>>) map_clauses(to) capture(ByRef) -> !fir.ref>>>> {{.*}} // CHECK: %[[DESC_LD_1:.*]] = fir.load %[[DESC_1]] : !fir.ref>>>> // CHECK: %[[MEMBER_ACCESS_1:.*]] = fir.coordinate_of %[[DESC_LD_1]], %{{.*}} : (!fir.box>>>, index) -> !fir.ref> // CHECK: %[[DESC_2:.*]] = fir.coordinate_of %[[MEMBER_ACCESS_1]], %{{.*}} : (!fir.ref>, index) -> !fir.ref>>> // 
CHECK: %[[BASE_ADDR_2:.*]] = fir.box_offset %[[DESC_2]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> -// CHECK: %[[BASE_ADDR_MAP_2:.*]] = omp.map.info var_ptr(%[[DESC_2]] : !fir.ref>>>, !fir.array) var_ptr_ptr(%[[BASE_ADDR_2]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%{{.*}}) -> !fir.llvm_ptr>> {{.*}} +// CHECK: %[[BASE_ADDR_MAP_2:.*]] = omp.map.info var_ptr(%[[DESC_2]] : !fir.ref>>>, i32) var_ptr_ptr(%[[BASE_ADDR_2]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%{{.*}}) -> !fir.llvm_ptr>> {{.*}} // CHECK: %[[DESC_MAP_2:.*]] = omp.map.info var_ptr(%[[DESC_2]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} // CHECK: %[[TOP_PARENT_MAP:.*]] = omp.map.info var_ptr(%0#1 : !fir.ref>, !fir.type<[[REC_TY]]>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) members(%6, %5, %14, %13 : [1], [1, 0], [1, 0, 2], [1, 0, 2, 0] : !fir.ref>>>>, !fir.llvm_ptr>>>, !fir.ref>>>, !fir.llvm_ptr>>) -> !fir.ref> {{{.*}} partial_map = true} // CHECK: omp.target map_entries(%[[TOP_PARENT_MAP]] -> %{{.*}}, %[[DESC_MAP_1]] -> %{{.*}}, %[[BASE_ADDR_MAP_1]] -> %{{.*}}, %[[DESC_MAP_2]] -> %{{.*}}, %[[BASE_ADDR_MAP_2]] -> %{{.*}} : !fir.ref>, !fir.ref>>>>, !fir.llvm_ptr>>>, !fir.ref>>>, !fir.llvm_ptr>>) { diff --git a/offload/test/offloading/fortran/target-map-local-intrinisc-sized-param.f90 b/offload/test/offloading/fortran/target-map-local-intrinisc-sized-param.f90 new file mode 100644 index 0000000000000..b4fded7b3c70a --- /dev/null +++ b/offload/test/offloading/fortran/target-map-local-intrinisc-sized-param.f90 @@ -0,0 +1,39 @@ +! Offloading test checking interaction of an local array +! sized utilising an input parameter and the size intrinsic +! when being mapped to device. +! REQUIRES: flang, amdgpu + +! 
RUN: %libomptarget-compile-fortran-run-and-check-generic +module mod + use iso_fortran_env, only: real64 + implicit none +contains + subroutine test(a) + implicit none + integer :: i + real(kind=real64), dimension(:) :: a + real(kind=real64), dimension(size(a, 1)) :: b + +!$omp target map(tofrom: b) + do i = 1, 10 + b(i) = i + end do +!$omp end target + + print *, b + end subroutine +end module mod + +program main + use mod + real(kind=real64), allocatable :: a(:) + allocate(a(10)) + + do i = 1, 10 + a(i) = i + end do + + call test(a) +end program main + +!CHECK: 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. From b9482ceb97f7cf7cde707dd81a0149dc9958ae53 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Fri, 3 Jan 2025 08:17:52 -0800 Subject: [PATCH 089/480] [flang] Improve designate/elemental indices match in opt-bufferization. (#121371) This pattern appears in `tonto`: `rys1%w = rys1%w * ...`, where component `w` is a pointer. Due to the computations transforming the elemental's one-based indices to the array indices, the indices match check did not pass in opt-bufferization. This patch recognizes this indices adjusting pattern, and returns the one-based indices for the designator. 
--- .../Transforms/OptimizedBufferization.cpp | 76 ++++++++++++++++++- .../opt-bufferization-same-ptr-elemental.fir | 69 +++++++++++++++++ 2 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 flang/test/HLFIR/opt-bufferization-same-ptr-elemental.fir diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp index bf3cf861e46f4..bfaabed013678 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp @@ -87,6 +87,13 @@ class ElementalAssignBufferization /// determines if the transformation can be applied to this elemental static std::optional findMatch(hlfir::ElementalOp elemental); + /// Returns the array indices for the given hlfir.designate. + /// It recognizes the computations used to transform the one-based indices + /// into the array's lb-based indices, and returns the one-based indices + /// in these cases. + static llvm::SmallVector + getDesignatorIndices(hlfir::DesignateOp designate); + public: using mlir::OpRewritePattern::OpRewritePattern; @@ -430,6 +437,73 @@ bool ArraySectionAnalyzer::isLess(mlir::Value v1, mlir::Value v2) { return false; } +llvm::SmallVector +ElementalAssignBufferization::getDesignatorIndices( + hlfir::DesignateOp designate) { + mlir::Value memref = designate.getMemref(); + + // If the object is a box, then the indices may be adjusted + // according to the box's lower bound(s). Scan through + // the computations to try to find the one-based indices. + if (mlir::isa(memref.getType())) { + // Look for the following pattern: + // %13 = fir.load %12 : !fir.ref + // %14:3 = fir.box_dims %13, %c0 : (!fir.box<...>, index) -> ... + // %17 = arith.subi %14#0, %c1 : index + // %18 = arith.addi %arg2, %17 : index + // %19 = hlfir.designate %13 (%18) : (!fir.box<...>, index) -> ... + // + // %arg2 is a one-based index. 
+ + auto isNormalizedLb = [memref](mlir::Value v, unsigned dim) { + // Return true, if v and dim are such that: + // %14:3 = fir.box_dims %13, %dim : (!fir.box<...>, index) -> ... + // %17 = arith.subi %14#0, %c1 : index + // %19 = hlfir.designate %13 (...) : (!fir.box<...>, index) -> ... + if (auto subOp = + mlir::dyn_cast_or_null(v.getDefiningOp())) { + auto cst = fir::getIntIfConstant(subOp.getRhs()); + if (!cst || *cst != 1) + return false; + if (auto dimsOp = mlir::dyn_cast_or_null( + subOp.getLhs().getDefiningOp())) { + if (memref != dimsOp.getVal() || + dimsOp.getResult(0) != subOp.getLhs()) + return false; + auto dimsOpDim = fir::getIntIfConstant(dimsOp.getDim()); + return dimsOpDim && dimsOpDim == dim; + } + } + return false; + }; + + llvm::SmallVector newIndices; + for (auto index : llvm::enumerate(designate.getIndices())) { + if (auto addOp = mlir::dyn_cast_or_null( + index.value().getDefiningOp())) { + for (unsigned opNum = 0; opNum < 2; ++opNum) + if (isNormalizedLb(addOp->getOperand(opNum), index.index())) { + newIndices.push_back(addOp->getOperand((opNum + 1) % 2)); + break; + } + + // If new one-based index was not added, exit early. + if (newIndices.size() <= index.index()) + break; + } + } + + // If any of the indices is not adjusted to the array's lb, + // then return the original designator indices. 
+ if (newIndices.size() != designate.getIndices().size()) + return designate.getIndices(); + + return newIndices; + } + + return designate.getIndices(); +} + std::optional ElementalAssignBufferization::findMatch(hlfir::ElementalOp elemental) { mlir::Operation::user_range users = elemental->getUsers(); @@ -557,7 +631,7 @@ ElementalAssignBufferization::findMatch(hlfir::ElementalOp elemental) { << " at " << elemental.getLoc() << "\n"); return std::nullopt; } - auto indices = designate.getIndices(); + auto indices = getDesignatorIndices(designate); auto elementalIndices = elemental.getIndices(); if (indices.size() == elementalIndices.size() && std::equal(indices.begin(), indices.end(), elementalIndices.begin(), diff --git a/flang/test/HLFIR/opt-bufferization-same-ptr-elemental.fir b/flang/test/HLFIR/opt-bufferization-same-ptr-elemental.fir new file mode 100644 index 0000000000000..ae91930d44eb1 --- /dev/null +++ b/flang/test/HLFIR/opt-bufferization-same-ptr-elemental.fir @@ -0,0 +1,69 @@ +// RUN: fir-opt --opt-bufferization %s | FileCheck %s + +// Verify that the hlfir.assign of hlfir.elemental is optimized +// into element-per-element assignment: +// subroutine test1(p) +// real, pointer :: p(:) +// p = p + 1.0 +// end subroutine test1 + +func.func @_QPtest1(%arg0: !fir.ref>>> {fir.bindc_name = "p"}) { + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 1.000000e+00 : f32 + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest1Ep"} : (!fir.ref>>>, !fir.dscope) -> (!fir.ref>>>, !fir.ref>>>) + %2 = fir.load %1#0 : !fir.ref>>> + %3:3 = fir.box_dims %2, %c0 : (!fir.box>>, index) -> (index, index, index) + %4 = fir.shape %3#1 : (index) -> !fir.shape<1> + %5 = hlfir.elemental %4 unordered : (!fir.shape<1>) -> !hlfir.expr { + ^bb0(%arg1: index): + %6 = arith.subi %3#0, %c1 : index + %7 = arith.addi %arg1, %6 : index + %8 = hlfir.designate %2 (%7) : 
(!fir.box>>, index) -> !fir.ref + %9 = fir.load %8 : !fir.ref + %10 = arith.addf %9, %cst fastmath : f32 + hlfir.yield_element %10 : f32 + } + hlfir.assign %5 to %2 : !hlfir.expr, !fir.box>> + hlfir.destroy %5 : !hlfir.expr + return +} +// CHECK-LABEL: func.func @_QPtest1( +// CHECK-NOT: hlfir.assign +// CHECK: hlfir.assign %{{.*}} to %{{.*}} : f32, !fir.ref +// CHECK-NOT: hlfir.assign + +// subroutine test2(p) +// real, pointer :: p(:,:) +// p = p + 1.0 +// end subroutine test2 +func.func @_QPtest2(%arg0: !fir.ref>>> {fir.bindc_name = "p"}) { + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 1.000000e+00 : f32 + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest2Ep"} : (!fir.ref>>>, !fir.dscope) -> (!fir.ref>>>, !fir.ref>>>) + %2 = fir.load %1#0 : !fir.ref>>> + %3:3 = fir.box_dims %2, %c0 : (!fir.box>>, index) -> (index, index, index) + %4:3 = fir.box_dims %2, %c1 : (!fir.box>>, index) -> (index, index, index) + %5 = fir.shape %3#1, %4#1 : (index, index) -> !fir.shape<2> + %6 = hlfir.elemental %5 unordered : (!fir.shape<2>) -> !hlfir.expr { + ^bb0(%arg1: index, %arg2: index): + %7 = arith.subi %3#0, %c1 : index + %8 = arith.addi %arg1, %7 : index + %9 = arith.subi %4#0, %c1 : index + %10 = arith.addi %arg2, %9 : index + %11 = hlfir.designate %2 (%8, %10) : (!fir.box>>, index, index) -> !fir.ref + %12 = fir.load %11 : !fir.ref + %13 = arith.addf %12, %cst fastmath : f32 + hlfir.yield_element %13 : f32 + } + hlfir.assign %6 to %2 : !hlfir.expr, !fir.box>> + hlfir.destroy %6 : !hlfir.expr + return +} +// CHECK-LABEL: func.func @_QPtest2( +// CHECK-NOT: hlfir.assign +// CHECK: hlfir.assign %{{.*}} to %{{.*}} : f32, !fir.ref +// CHECK-NOT: hlfir.assign From 3c700d131a35ce4b0063a4688dce4a0cb739ca83 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Fri, 3 Jan 2025 08:33:14 -0800 Subject: [PATCH 090/480] [flang] Extract hlfir.assign inlining from 
opt-bufferization. (#121544) Optimized bufferization can transform hlfir.assign into a loop nest doing element per element assignment, but it avoids doing so for RHS that is hlfir.expr. This is done to let ElementalAssignBufferization pattern to try to do a better job. This patch moves the hlfir.assign inlining after opt-bufferization, and enables it for hlfir.expr RHS. The hlfir.expr RHS cases are present in tonto, and this patch results in some nice improvements. Note that those cases are handled by other compilers also using array temporaries, so this patch seems to just get rid of the Assign runtime overhead/inefficiency. --- flang/include/flang/Optimizer/HLFIR/Passes.td | 4 + .../Optimizer/HLFIR/Transforms/CMakeLists.txt | 1 + .../HLFIR/Transforms/InlineHLFIRAssign.cpp | 152 ++++++++++++++++++ .../Transforms/OptimizedBufferization.cpp | 109 +------------ flang/lib/Optimizer/Passes/Pipelines.cpp | 2 + flang/test/Driver/mlir-pass-pipeline.f90 | 4 + flang/test/Fir/basic-program.fir | 4 + ...ble-assign.fir => inline-hlfir-assign.fir} | 57 ++++++- flang/test/HLFIR/maxloc-elemental.fir | 8 +- flang/test/HLFIR/minloc-elemental.fir | 16 +- .../HLFIR/opt-bufferization-eval_in_mem.fir | 7 +- flang/test/HLFIR/opt-bufferization.fir | 42 ----- 12 files changed, 228 insertions(+), 178 deletions(-) create mode 100644 flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp rename flang/test/HLFIR/{opt-variable-assign.fir => inline-hlfir-assign.fir} (84%) diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td index ed49f5093c965..644f1e3c3af2b 100644 --- a/flang/include/flang/Optimizer/HLFIR/Passes.td +++ b/flang/include/flang/Optimizer/HLFIR/Passes.td @@ -49,4 +49,8 @@ def InlineElementals : Pass<"inline-elementals"> { let summary = "Inline chained hlfir.elemental operations"; } +def InlineHLFIRAssign : Pass<"inline-hlfir-assign"> { + let summary = "Inline hlfir.assign operations"; +} + #endif 
//FORTRAN_DIALECT_HLFIR_PASSES diff --git a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt index d18df2ef49f10..25a532204dd05 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt @@ -4,6 +4,7 @@ add_flang_library(HLFIRTransforms BufferizeHLFIR.cpp ConvertToFIR.cpp InlineElementals.cpp + InlineHLFIRAssign.cpp LowerHLFIRIntrinsics.cpp LowerHLFIROrderedAssignments.cpp ScheduleOrderedAssignments.cpp diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp new file mode 100644 index 0000000000000..249976d5509b0 --- /dev/null +++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp @@ -0,0 +1,152 @@ +//===- InlineHLFIRAssign.cpp - Inline hlfir.assign ops --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Transform hlfir.assign array operations into loop nests performing element +// per element assignments. The inlining is done for trivial data types always, +// though, we may add performance/code-size heuristics in future. 
+//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Analysis/AliasAnalysis.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/HLFIRTools.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" +#include "flang/Optimizer/HLFIR/Passes.h" +#include "flang/Optimizer/OpenMP/Passes.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +namespace hlfir { +#define GEN_PASS_DEF_INLINEHLFIRASSIGN +#include "flang/Optimizer/HLFIR/Passes.h.inc" +} // namespace hlfir + +#define DEBUG_TYPE "inline-hlfir-assign" + +namespace { +/// Expand hlfir.assign of array RHS to array LHS into a loop nest +/// of element-by-element assignments: +/// hlfir.assign %4 to %5 : !fir.ref>, +/// !fir.ref> +/// into: +/// fir.do_loop %arg1 = %c1 to %c3 step %c1 unordered { +/// fir.do_loop %arg2 = %c1 to %c3 step %c1 unordered { +/// %6 = hlfir.designate %4 (%arg2, %arg1) : +/// (!fir.ref>, index, index) -> !fir.ref +/// %7 = fir.load %6 : !fir.ref +/// %8 = hlfir.designate %5 (%arg2, %arg1) : +/// (!fir.ref>, index, index) -> !fir.ref +/// hlfir.assign %7 to %8 : f32, !fir.ref +/// } +/// } +/// +/// The transformation is correct only when LHS and RHS do not alias. +/// When RHS is an array expression, then there is no aliasing. +/// This transformation does not support runtime checking for +/// non-conforming LHS/RHS arrays' shapes currently. 
+class InlineHLFIRAssignConversion + : public mlir::OpRewritePattern { +public: + using mlir::OpRewritePattern::OpRewritePattern; + + llvm::LogicalResult + matchAndRewrite(hlfir::AssignOp assign, + mlir::PatternRewriter &rewriter) const override { + if (assign.isAllocatableAssignment()) + return rewriter.notifyMatchFailure(assign, + "AssignOp may imply allocation"); + + hlfir::Entity rhs{assign.getRhs()}; + + if (!rhs.isArray()) + return rewriter.notifyMatchFailure(assign, + "AssignOp's RHS is not an array"); + + mlir::Type rhsEleTy = rhs.getFortranElementType(); + if (!fir::isa_trivial(rhsEleTy)) + return rewriter.notifyMatchFailure( + assign, "AssignOp's RHS data type is not trivial"); + + hlfir::Entity lhs{assign.getLhs()}; + if (!lhs.isArray()) + return rewriter.notifyMatchFailure(assign, + "AssignOp's LHS is not an array"); + + mlir::Type lhsEleTy = lhs.getFortranElementType(); + if (!fir::isa_trivial(lhsEleTy)) + return rewriter.notifyMatchFailure( + assign, "AssignOp's LHS data type is not trivial"); + + if (lhsEleTy != rhsEleTy) + return rewriter.notifyMatchFailure(assign, + "RHS/LHS element types mismatch"); + + if (!mlir::isa(rhs.getType())) { + // If RHS is not an hlfir.expr, then we should prove that + // LHS and RHS do not alias. + // TODO: if they may alias, we can insert hlfir.as_expr for RHS, + // and proceed with the inlining. + fir::AliasAnalysis aliasAnalysis; + mlir::AliasResult aliasRes = aliasAnalysis.alias(lhs, rhs); + // TODO: use areIdenticalOrDisjointSlices() from + // OptimizedBufferization.cpp to check if we can still do the expansion. 
+ if (!aliasRes.isNo()) { + LLVM_DEBUG(llvm::dbgs() << "InlineHLFIRAssign:\n" + << "\tLHS: " << lhs << "\n" + << "\tRHS: " << rhs << "\n" + << "\tALIAS: " << aliasRes << "\n"); + return rewriter.notifyMatchFailure(assign, "RHS/LHS may alias"); + } + } + + mlir::Location loc = assign->getLoc(); + fir::FirOpBuilder builder(rewriter, assign.getOperation()); + builder.setInsertionPoint(assign); + rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs); + lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs); + mlir::Value shape = hlfir::genShape(loc, builder, lhs); + llvm::SmallVector extents = + hlfir::getIndexExtents(loc, builder, shape); + hlfir::LoopNest loopNest = + hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true, + flangomp::shouldUseWorkshareLowering(assign)); + builder.setInsertionPointToStart(loopNest.body); + auto rhsArrayElement = + hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices); + rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement); + auto lhsArrayElement = + hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices); + builder.create(loc, rhsArrayElement, lhsArrayElement); + rewriter.eraseOp(assign); + return mlir::success(); + } +}; + +class InlineHLFIRAssignPass + : public hlfir::impl::InlineHLFIRAssignBase { +public: + void runOnOperation() override { + mlir::MLIRContext *context = &getContext(); + + mlir::GreedyRewriteConfig config; + // Prevent the pattern driver from merging blocks. 
+ config.enableRegionSimplification = + mlir::GreedySimplifyRegionLevel::Disabled; + + mlir::RewritePatternSet patterns(context); + patterns.insert(context); + + if (mlir::failed(mlir::applyPatternsGreedily( + getOperation(), std::move(patterns), config))) { + mlir::emitError(getOperation()->getLoc(), + "failure in hlfir.assign inlining"); + signalPassFailure(); + } + } +}; +} // namespace diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp index bfaabed013678..0cfefc2d23ecb 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp @@ -772,108 +772,6 @@ llvm::LogicalResult BroadcastAssignBufferization::matchAndRewrite( return mlir::success(); } -/// Expand hlfir.assign of array RHS to array LHS into a loop nest -/// of element-by-element assignments: -/// hlfir.assign %4 to %5 : !fir.ref>, -/// !fir.ref> -/// into: -/// fir.do_loop %arg1 = %c1 to %c3 step %c1 unordered { -/// fir.do_loop %arg2 = %c1 to %c3 step %c1 unordered { -/// %6 = hlfir.designate %4 (%arg2, %arg1) : -/// (!fir.ref>, index, index) -> !fir.ref -/// %7 = fir.load %6 : !fir.ref -/// %8 = hlfir.designate %5 (%arg2, %arg1) : -/// (!fir.ref>, index, index) -> !fir.ref -/// hlfir.assign %7 to %8 : f32, !fir.ref -/// } -/// } -/// -/// The transformation is correct only when LHS and RHS do not alias. -/// This transformation does not support runtime checking for -/// non-conforming LHS/RHS arrays' shapes currently. 
-class VariableAssignBufferization - : public mlir::OpRewritePattern { -private: -public: - using mlir::OpRewritePattern::OpRewritePattern; - - llvm::LogicalResult - matchAndRewrite(hlfir::AssignOp assign, - mlir::PatternRewriter &rewriter) const override; -}; - -llvm::LogicalResult VariableAssignBufferization::matchAndRewrite( - hlfir::AssignOp assign, mlir::PatternRewriter &rewriter) const { - if (assign.isAllocatableAssignment()) - return rewriter.notifyMatchFailure(assign, "AssignOp may imply allocation"); - - hlfir::Entity rhs{assign.getRhs()}; - - // To avoid conflicts with ElementalAssignBufferization pattern, we avoid - // matching RHS when it is an `ExprType` defined by an `ElementalOp`; which is - // among the main criteria matched by ElementalAssignBufferization. - if (mlir::isa(rhs.getType()) && - mlir::isa(rhs.getDefiningOp())) - return rewriter.notifyMatchFailure( - assign, "RHS is an ExprType defined by ElementalOp"); - - if (!rhs.isArray()) - return rewriter.notifyMatchFailure(assign, - "AssignOp's RHS is not an array"); - - mlir::Type rhsEleTy = rhs.getFortranElementType(); - if (!fir::isa_trivial(rhsEleTy)) - return rewriter.notifyMatchFailure( - assign, "AssignOp's RHS data type is not trivial"); - - hlfir::Entity lhs{assign.getLhs()}; - if (!lhs.isArray()) - return rewriter.notifyMatchFailure(assign, - "AssignOp's LHS is not an array"); - - mlir::Type lhsEleTy = lhs.getFortranElementType(); - if (!fir::isa_trivial(lhsEleTy)) - return rewriter.notifyMatchFailure( - assign, "AssignOp's LHS data type is not trivial"); - - if (lhsEleTy != rhsEleTy) - return rewriter.notifyMatchFailure(assign, - "RHS/LHS element types mismatch"); - - fir::AliasAnalysis aliasAnalysis; - mlir::AliasResult aliasRes = aliasAnalysis.alias(lhs, rhs); - // TODO: use areIdenticalOrDisjointSlices() to check if - // we can still do the expansion. 
- if (!aliasRes.isNo()) { - LLVM_DEBUG(llvm::dbgs() << "VariableAssignBufferization:\n" - << "\tLHS: " << lhs << "\n" - << "\tRHS: " << rhs << "\n" - << "\tALIAS: " << aliasRes << "\n"); - return rewriter.notifyMatchFailure(assign, "RHS/LHS may alias"); - } - - mlir::Location loc = assign->getLoc(); - fir::FirOpBuilder builder(rewriter, assign.getOperation()); - builder.setInsertionPoint(assign); - rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs); - lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs); - mlir::Value shape = hlfir::genShape(loc, builder, lhs); - llvm::SmallVector extents = - hlfir::getIndexExtents(loc, builder, shape); - hlfir::LoopNest loopNest = - hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true, - flangomp::shouldUseWorkshareLowering(assign)); - builder.setInsertionPointToStart(loopNest.body); - auto rhsArrayElement = - hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices); - rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement); - auto lhsArrayElement = - hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices); - builder.create(loc, rhsArrayElement, lhsArrayElement); - rewriter.eraseOp(assign); - return mlir::success(); -} - using GenBodyFn = std::function &)>; @@ -1280,9 +1178,9 @@ class ReductionMaskConversion : public mlir::OpRewritePattern { loc, resultArr, builder.createBool(loc, false)); // Check all the users - the destroy is no longer required, and any assign - // can use resultArr directly so that VariableAssignBufferization in this - // pass can optimize the results. Other operations are replaces with an - // AsExpr for the temporary resultArr. + // can use resultArr directly so that InlineHLFIRAssign pass + // can optimize the results. Other operations are replaced with an AsExpr + // for the temporary resultArr. 
llvm::SmallVector destroys; llvm::SmallVector assigns; for (auto user : mloc->getUsers()) { @@ -1430,7 +1328,6 @@ class OptimizedBufferizationPass // This requires small code reordering in ElementalAssignBufferization. patterns.insert(context); patterns.insert(context); - patterns.insert(context); patterns.insert(context); patterns.insert>(context); patterns.insert>(context); diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index 72803aa3793ce..20e4599587c4b 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -234,6 +234,8 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP, pm.addPass(mlir::createCSEPass()); addNestedPassToAllTopLevelOperations( pm, hlfir::createOptimizedBufferization); + addNestedPassToAllTopLevelOperations( + pm, hlfir::createInlineHLFIRAssign); } pm.addPass(hlfir::createLowerHLFIROrderedAssignments()); pm.addPass(hlfir::createLowerHLFIRIntrinsics()); diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90 index b30affe691b84..9655afce96d92 100644 --- a/flang/test/Driver/mlir-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-pass-pipeline.f90 @@ -36,12 +36,16 @@ ! O2-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] ! O2-NEXT: 'fir.global' Pipeline ! O2-NEXT: OptimizedBufferization +! O2-NEXT: InlineHLFIRAssign ! O2-NEXT: 'func.func' Pipeline ! O2-NEXT: OptimizedBufferization +! O2-NEXT: InlineHLFIRAssign ! O2-NEXT: 'omp.declare_reduction' Pipeline ! O2-NEXT: OptimizedBufferization +! O2-NEXT: InlineHLFIRAssign ! O2-NEXT: 'omp.private' Pipeline ! O2-NEXT: OptimizedBufferization +! O2-NEXT: InlineHLFIRAssign ! ALL: LowerHLFIROrderedAssignments ! ALL-NEXT: LowerHLFIRIntrinsics ! 
ALL-NEXT: BufferizeHLFIR diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index d2788008c3893..620882ebbed2a 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -37,12 +37,16 @@ func.func @_QQmain() { // PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] // PASSES-NEXT: 'fir.global' Pipeline // PASSES-NEXT: OptimizedBufferization +// PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: 'func.func' Pipeline // PASSES-NEXT: OptimizedBufferization +// PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: 'omp.declare_reduction' Pipeline // PASSES-NEXT: OptimizedBufferization +// PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: 'omp.private' Pipeline // PASSES-NEXT: OptimizedBufferization +// PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: LowerHLFIROrderedAssignments // PASSES-NEXT: LowerHLFIRIntrinsics // PASSES-NEXT: BufferizeHLFIR diff --git a/flang/test/HLFIR/opt-variable-assign.fir b/flang/test/HLFIR/inline-hlfir-assign.fir similarity index 84% rename from flang/test/HLFIR/opt-variable-assign.fir rename to flang/test/HLFIR/inline-hlfir-assign.fir index 17124fa86af65..f834e7971e3d5 100644 --- a/flang/test/HLFIR/opt-variable-assign.fir +++ b/flang/test/HLFIR/inline-hlfir-assign.fir @@ -1,6 +1,5 @@ -// Test optimized bufferization for hlfir.assign of arrays -// variables: -// RUN: fir-opt --opt-bufferization %s | FileCheck %s +// Test inlining of hlfir.assign of arrays: +// RUN: fir-opt --inline-hlfir-assign %s | FileCheck %s // The two assigns come from the following source forms: // y(:,:) = x(:,:) @@ -302,3 +301,55 @@ func.func @_QPtest7(%arg0: !fir.ref>>> {f // CHECK-NOT: hlfir.assign // CHECK: hlfir.assign %{{.*}} to %{{.*}} : f32, !fir.ref // CHECK-NOT: hlfir.assign + + +// Test that VAR = EXPR assignment is inlined: +// subroutine test_expr_rhs(p1, p2) +// logical, pointer :: p1(:), p2(:) +// p1 = (p2) +// end subroutine test_expr_rhs +func.func 
@_QPtest_expr_rhs(%arg0: !fir.ref>>>> {fir.bindc_name = "p1"}, %arg1: !fir.ref>>>> {fir.bindc_name = "p2"}) { + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest_expr_rhsEp1"} : (!fir.ref>>>>, !fir.dscope) -> (!fir.ref>>>>, !fir.ref>>>>) + %2:2 = hlfir.declare %arg1 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest_expr_rhsEp2"} : (!fir.ref>>>>, !fir.dscope) -> (!fir.ref>>>>, !fir.ref>>>>) + %3 = fir.load %2#0 : !fir.ref>>>> + %4:3 = fir.box_dims %3, %c0 : (!fir.box>>>, index) -> (index, index, index) + %5 = fir.shape %4#1 : (index) -> !fir.shape<1> + %6 = hlfir.elemental %5 unordered : (!fir.shape<1>) -> !hlfir.expr> { + ^bb0(%arg2: index): + %8 = arith.subi %4#0, %c1 : index + %9 = arith.addi %arg2, %8 : index + %10 = hlfir.designate %3 (%9) : (!fir.box>>>, index) -> !fir.ref> + %11 = fir.load %10 : !fir.ref> + %12 = hlfir.no_reassoc %11 : !fir.logical<4> + hlfir.yield_element %12 : !fir.logical<4> + } + %7 = fir.load %1#0 : !fir.ref>>>> + hlfir.assign %6 to %7 : !hlfir.expr>, !fir.box>>> + hlfir.destroy %6 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPtest_expr_rhs( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>>> {fir.bindc_name = "p1"}, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref>>>> {fir.bindc_name = "p2"}) { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_4]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest_expr_rhsEp1"} : (!fir.ref>>>>, !fir.dscope) -> (!fir.ref>>>>, !fir.ref>>>>) +// CHECK: %[[VAL_10:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr> { +// CHECK: } +// CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref>>>> +// CHECK: %[[VAL_18:.*]]:3 = fir.box_dims 
%[[VAL_17]], %[[VAL_3]] : (!fir.box>>>, index) -> (index, index, index) +// CHECK: fir.do_loop %[[VAL_19:.*]] = %[[VAL_2]] to %[[VAL_18]]#1 step %[[VAL_2]] unordered { +// CHECK: %[[VAL_20:.*]] = hlfir.apply %[[VAL_10]], %[[VAL_19]] : (!hlfir.expr>, index) -> !fir.logical<4> +// CHECK: %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_17]], %[[VAL_3]] : (!fir.box>>>, index) -> (index, index, index) +// CHECK: %[[VAL_22:.*]] = arith.subi %[[VAL_21]]#0, %[[VAL_2]] : index +// CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_19]], %[[VAL_22]] : index +// CHECK: %[[VAL_24:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_23]]) : (!fir.box>>>, index) -> !fir.ref> +// CHECK: hlfir.assign %[[VAL_20]] to %[[VAL_24]] : !fir.logical<4>, !fir.ref> +// CHECK: } +// CHECK: hlfir.destroy %[[VAL_10]] : !hlfir.expr> +// CHECK: return +// CHECK: } diff --git a/flang/test/HLFIR/maxloc-elemental.fir b/flang/test/HLFIR/maxloc-elemental.fir index 497a58c9bd1d4..c9210a59f0340 100644 --- a/flang/test/HLFIR/maxloc-elemental.fir +++ b/flang/test/HLFIR/maxloc-elemental.fir @@ -68,13 +68,7 @@ func.func @_QPtest(%arg0: !fir.box> {fir.bindc_name = "array"} // CHECK-NEXT: } // CHECK-NEXT: fir.result %[[V18]] : i32 // CHECK-NEXT: } -// CHECK-NEXT: %[[BD:.*]]:3 = fir.box_dims %[[V2]]#0, %c0 : (!fir.box>, index) -> (index, index, index) -// CHECK-NEXT: fir.do_loop %arg3 = %c1 to %[[BD]]#1 step %c1 unordered { -// CHECK-NEXT: %[[V13:.*]] = hlfir.designate %[[RES]] (%arg3) : (!fir.ref>, index) -> !fir.ref -// CHECK-NEXT: %[[V14:.*]] = fir.load %[[V13]] : !fir.ref -// CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[V2]]#0 (%arg3) : (!fir.box>, index) -> !fir.ref -// CHECK-NEXT: hlfir.assign %[[V14]] to %[[V15]] : i32, !fir.ref -// CHECK-NEXT: } +// CHECK-NEXT: hlfir.assign %[[RES]] to %[[V2]]#0 : !fir.ref>, !fir.box> // CHECK-NEXT: return // CHECK-NEXT: } diff --git a/flang/test/HLFIR/minloc-elemental.fir b/flang/test/HLFIR/minloc-elemental.fir index 5fa482a7b904e..9453a335b4fbf 100644 --- 
a/flang/test/HLFIR/minloc-elemental.fir +++ b/flang/test/HLFIR/minloc-elemental.fir @@ -68,13 +68,7 @@ func.func @_QPtest(%arg0: !fir.box> {fir.bindc_name = "array"} // CHECK-NEXT: } // CHECK-NEXT: fir.result %[[V18]] : i32 // CHECK-NEXT: } -// CHECK-NEXT: %[[BD:.*]]:3 = fir.box_dims %[[V2]]#0, %c0 : (!fir.box>, index) -> (index, index, index) -// CHECK-NEXT: fir.do_loop %arg3 = %c1 to %[[BD]]#1 step %c1 unordered { -// CHECK-NEXT: %[[V13:.*]] = hlfir.designate %[[RES]] (%arg3) : (!fir.ref>, index) -> !fir.ref -// CHECK-NEXT: %[[V14:.*]] = fir.load %[[V13]] : !fir.ref -// CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[V2]]#0 (%arg3) : (!fir.box>, index) -> !fir.ref -// CHECK-NEXT: hlfir.assign %[[V14]] to %[[V15]] : i32, !fir.ref -// CHECK-NEXT: } +// CHECK-NEXT: hlfir.assign %[[RES]] to %[[V2]]#0 : !fir.ref>, !fir.box> // CHECK-NEXT: return // CHECK-NEXT: } @@ -147,13 +141,7 @@ func.func @_QPtest_kind2(%arg0: !fir.box> {fir.bindc_name = "a // CHECK-NEXT: } // CHECK-NEXT: fir.result %[[V18]] : i32 // CHECK-NEXT: } -// CHECK-NEXT: %[[BD:.*]]:3 = fir.box_dims %[[V2]]#0, %c0 : (!fir.box>, index) -> (index, index, index) -// CHECK-NEXT: fir.do_loop %arg3 = %c1 to %[[BD]]#1 step %c1 unordered { -// CHECK-NEXT: %[[V13:.*]] = hlfir.designate %[[RES]] (%arg3) : (!fir.ref>, index) -> !fir.ref -// CHECK-NEXT: %[[V14:.*]] = fir.load %[[V13]] : !fir.ref -// CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[V2]]#0 (%arg3) : (!fir.box>, index) -> !fir.ref -// CHECK-NEXT: hlfir.assign %[[V14]] to %[[V15]] : i16, !fir.ref -// CHECK-NEXT: } +// CHECK-NEXT: hlfir.assign %[[RES]] to %[[V2]]#0 : !fir.ref>, !fir.box> // CHECK-NEXT: return diff --git a/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir b/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir index 984c0bcbaddcc..ce669073dbb1b 100644 --- a/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir +++ b/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir @@ -48,7 +48,6 @@ func.func @_QPnegative_test_is_target(%arg0: !fir.ref> {fir.b } // 
CHECK-LABEL: func.func @_QPnegative_test_is_target( // CHECK-SAME: %[[VAL_0:.*]]: !fir.ref> {fir.bindc_name = "x", fir.target}) { -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index // CHECK: %[[VAL_2:.*]] = arith.constant false // CHECK: %[[VAL_3:.*]] = arith.constant 10 : index // CHECK: %[[VAL_4:.*]] = fir.alloca !fir.array<10xf32> @@ -57,11 +56,7 @@ func.func @_QPnegative_test_is_target(%arg0: !fir.ref> {fir.b // CHECK: %[[VAL_9:.*]] = fir.call @_QPfoo() fastmath : () -> !fir.array<10xf32> // CHECK: fir.save_result %[[VAL_9]] to %[[VAL_8]]#1{{.*}} // CHECK: %[[VAL_10:.*]] = hlfir.as_expr %[[VAL_8]]#0 move %[[VAL_2]] : (!fir.ref>, i1) -> !hlfir.expr<10xf32> -// CHECK: fir.do_loop %[[VAL_11:.*]] = %[[VAL_1]] to %[[VAL_3]] step %[[VAL_1]] unordered { -// CHECK: %[[VAL_12:.*]] = hlfir.apply %[[VAL_10]], %[[VAL_11]] : (!hlfir.expr<10xf32>, index) -> f32 -// CHECK: %[[VAL_13:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_11]]) : (!fir.ref>, index) -> !fir.ref -// CHECK: hlfir.assign %[[VAL_12]] to %[[VAL_13]] : f32, !fir.ref -// CHECK: } +// CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : !hlfir.expr<10xf32>, !fir.ref> // CHECK: hlfir.destroy %[[VAL_10]] : !hlfir.expr<10xf32> // CHECK: return // CHECK: } diff --git a/flang/test/HLFIR/opt-bufferization.fir b/flang/test/HLFIR/opt-bufferization.fir index 87afb3cc92453..faa8f4bcdb778 100644 --- a/flang/test/HLFIR/opt-bufferization.fir +++ b/flang/test/HLFIR/opt-bufferization.fir @@ -796,45 +796,3 @@ func.func @_QPddx(%arg0: !fir.box> {fir.bindc_name = "array" // CHECK: %[[VAL_61:.*]] = fir.load %[[VAL_26]]#1 : !fir.ref> // CHECK: return %[[VAL_61]] : !fir.array // CHECK: } - -// `hlfir.expr` bufferization (when the expresion is not the result of -// `hlfir.elemental`) -func.func @_QPfoo() { - %c1 = arith.constant 1 : index - %0 = fir.alloca !fir.array<1xi32> {bindc_name = "iavs", uniq_name = "_QFfooEiavs"} - %1 = fir.shape %c1 : (index) -> !fir.shape<1> - %2:2 = hlfir.declare %0(%1) {uniq_name = "_QFfooEiavs"} : 
(!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) - %3 = fir.alloca i32 {bindc_name = "iv", uniq_name = "_QFfooEiv"} - %4:2 = hlfir.declare %3 {uniq_name = "_QFfooEiv"} : (!fir.ref) -> (!fir.ref, !fir.ref) - %c10_i32 = arith.constant 10 : i32 - %6 = fir.convert %c10_i32 : (i32) -> index - %7 = fir.convert %c1 : (index) -> i32 - %8:2 = fir.do_loop %arg0 = %c1 to %6 step %c1 iter_args(%arg1 = %7) -> (index, i32) { - fir.store %arg1 to %4#1 : !fir.ref - %9 = fir.allocmem !fir.array<1xi32> {bindc_name = ".tmp.arrayctor", uniq_name = ""} - %10 = fir.shape %c1 : (index) -> !fir.shape<1> - %11:2 = hlfir.declare %9(%10) {uniq_name = ".tmp.arrayctor"} : (!fir.heap>, !fir.shape<1>) -> (!fir.heap>, !fir.heap>) - %12 = fir.load %4#0 : !fir.ref - %13 = hlfir.designate %11#0 (%c1) : (!fir.heap>, index) -> !fir.ref - hlfir.assign %12 to %13 : i32, !fir.ref - %true = arith.constant true - %14 = hlfir.as_expr %11#0 move %true : (!fir.heap>, i1) -> !hlfir.expr<1xi32> - hlfir.assign %14 to %2#0 : !hlfir.expr<1xi32>, !fir.ref> - hlfir.destroy %14 : !hlfir.expr<1xi32> - %15 = arith.addi %arg0, %c1 : index - %16 = fir.convert %c1 : (index) -> i32 - %17 = fir.load %4#1 : !fir.ref - %18 = arith.addi %17, %16 : i32 - fir.result %15, %18 : index, i32 - } - fir.store %8#1 to %4#1 : !fir.ref - return -} - -// CHECK-LABEL: func.func @_QPfoo -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: fir.do_loop {{.*}} { -// CHECK-NOT: hlfir.assign %{{.*}} to %{{.*}}#0 : !hlfir.expr<1xi32>, !fir.ref> -// CHECK: fir.do_loop %{{.*}} = %[[C1]] to %[[C1]] step %[[C1]] unordered { -// CHECK: } -// CHECK: } From 322f16e6246ada7cd53e71e927ee68273e819f78 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 11:43:07 -0500 Subject: [PATCH 091/480] [AMDGPU][True16][MC] true16 for v_sat_pk_u8_i16 (#120634) Support true16 format for v_sat_pk_u8_i16 in MC --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 69 +++++++++++-------- 
llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 65 +++++++++-------- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 21 ++++-- llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s | 18 +++++ .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s | 21 ++++-- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 59 ++++++++-------- .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 15 ++-- .../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s | 63 +++++++++-------- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 51 ++++++++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 6 ++ llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 6 ++ llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s | 18 +++++ .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s | 9 +++ .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 3 + .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 3 + .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 3 + .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 55 +++++++++++---- .../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 54 +++++++++++---- .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 17 ++++- .../gfx11_dasm_vop3_dpp16_from_vop1.txt | 46 +++++++++---- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 10 ++- .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt | 49 +++++++++---- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 50 ++++++++++---- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 13 +++- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 49 +++++++++---- .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 46 +++++++++---- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 10 ++- 28 files changed, 572 insertions(+), 259 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 1dd39be9e8d9c..bbb456ab739ab 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1047,7 +1047,7 @@ defm V_RNDNE_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f1 defm V_FRACT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05f, "v_fract_f16">; defm V_SIN_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x060, 
"v_sin_f16">; defm V_COS_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">; -defm V_SAT_PK_U8_I16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">; +defm V_SAT_PK_U8_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">; defm V_CVT_NORM_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x063, "v_cvt_norm_i16_f16">; defm V_CVT_NORM_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x064, "v_cvt_norm_u16_f16">; diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index 4e4dc6647daeb..4448720e6f79f 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -3236,50 +3236,59 @@ v_rsq_f64 v[5:6], src_scc v_rsq_f64 v[254:255], 0xaf123456 // GFX11: v_rsq_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf] -v_sat_pk_u8_i16 v5, v1 -// GFX11: v_sat_pk_u8_i16_e32 v5, v1 ; encoding: [0x01,0xc5,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, v1 +// GFX11: v_sat_pk_u8_i16_e32 v5.l, v1 ; encoding: [0x01,0xc5,0x0a,0x7e] -v_sat_pk_u8_i16 v5, v255 -// GFX11: v_sat_pk_u8_i16_e32 v5, v255 ; encoding: [0xff,0xc5,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, v255 +// GFX11: v_sat_pk_u8_i16_e32 v5.l, v255 ; encoding: [0xff,0xc5,0x0a,0x7e] -v_sat_pk_u8_i16 v5, s1 -// GFX11: v_sat_pk_u8_i16_e32 v5, s1 ; encoding: [0x01,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, s1 +// GFX11: v_sat_pk_u8_i16_e32 v5.l, s1 ; encoding: [0x01,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, s105 -// GFX11: v_sat_pk_u8_i16_e32 v5, s105 ; encoding: [0x69,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, s105 +// GFX11: v_sat_pk_u8_i16_e32 v5.l, s105 ; encoding: [0x69,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, vcc_lo -// GFX11: v_sat_pk_u8_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, vcc_lo +// GFX11: v_sat_pk_u8_i16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, vcc_hi -// GFX11: v_sat_pk_u8_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e] 
+v_sat_pk_u8_i16 v5.l, vcc_hi +// GFX11: v_sat_pk_u8_i16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, ttmp15 -// GFX11: v_sat_pk_u8_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, ttmp15 +// GFX11: v_sat_pk_u8_i16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, m0 -// GFX11: v_sat_pk_u8_i16_e32 v5, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, m0 +// GFX11: v_sat_pk_u8_i16_e32 v5.l, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, exec_lo -// GFX11: v_sat_pk_u8_i16_e32 v5, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, exec_lo +// GFX11: v_sat_pk_u8_i16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, exec_hi -// GFX11: v_sat_pk_u8_i16_e32 v5, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, exec_hi +// GFX11: v_sat_pk_u8_i16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, null -// GFX11: v_sat_pk_u8_i16_e32 v5, null ; encoding: [0x7c,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, null +// GFX11: v_sat_pk_u8_i16_e32 v5.l, null ; encoding: [0x7c,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, -1 -// GFX11: v_sat_pk_u8_i16_e32 v5, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, -1 +// GFX11: v_sat_pk_u8_i16_e32 v5.l, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, 0.5 -// GFX11: v_sat_pk_u8_i16_e32 v5, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, 0.5 +// GFX11: v_sat_pk_u8_i16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v5, src_scc -// GFX11: v_sat_pk_u8_i16_e32 v5, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e] +v_sat_pk_u8_i16 v5.l, src_scc +// GFX11: v_sat_pk_u8_i16_e32 v5.l, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e] -v_sat_pk_u8_i16 v127, 0xfe0b -// GFX11: v_sat_pk_u8_i16_e32 v127, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_sat_pk_u8_i16 v127.l, 0xfe0b +// GFX11: v_sat_pk_u8_i16_e32 v127.l, 0xfe0b ; encoding: 
[0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_sat_pk_u8_i16 v127.l, 0.5 +// GFX11: v_sat_pk_u8_i16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc4,0xfe,0x7e] + +v_sat_pk_u8_i16 v5.h, src_scc +// GFX11: v_sat_pk_u8_i16_e32 v5.h, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7f] + +v_sat_pk_u8_i16 v127.h, 0xfe0b +// GFX11: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_sin_f16 v5, v1 // GFX11: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index 98e4b29b25666..da2a3615360a4 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ -2522,47 +2522,56 @@ v_rsq_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_rsq_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_rsq_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x5c,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_sat_pk_u8_i16 v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_sat_pk_u8_i16 v5.l, v1 quad_perm:[3,2,1,0] +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_sat_pk_u8_i16 v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_sat_pk_u8_i16 v5.l, v1 quad_perm:[0,1,2,3] +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_sat_pk_u8_i16 v5, v1 row_mirror -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 row_mirror +// GFX11: 
v_sat_pk_u8_i16_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_half_mirror -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 row_half_mirror +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_shl:1 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 row_shl:1 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_shl:15 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 row_shl:15 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_shr:1 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 row_shr:1 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_shr:15 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 row_shr:15 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_ror:1 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 
row_ror:1 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_ror:15 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 row_ror:15 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_sat_pk_u8_i16 v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_sat_pk_u8_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_sat_pk_u8_i16 v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_sat_pk_u8_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_sat_pk_u8_i16 v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_sat_pk_u8_i16 v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x05,0x30] +v_sat_pk_u8_i16 v127.l, v255 row_xmask:15 row_mask:0x3 
bank_mask:0x0 fi:1 +// GFX11: v_sat_pk_u8_i16_dpp v127.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x05,0x30] + +v_sat_pk_u8_i16 v127.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_sat_pk_u8_i16_dpp v127.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0x01,0x5f,0x01,0x01] + +v_sat_pk_u8_i16 v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_sat_pk_u8_i16_dpp v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x09,0x13] + +v_sat_pk_u8_i16 v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x05,0x30] v_sin_f16 v5, v1 quad_perm:[3,2,1,0] // GFX11: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index ab4606af2bb35..34cb2d097b7a7 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -587,14 +587,23 @@ v_rsq_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_rsq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_rsq_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x5c,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_sat_pk_u8_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_sat_pk_u8_i16 v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_sat_pk_u8_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_sat_pk_u8_i16 v5.l, v1 
dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_sat_pk_u8_i16 v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] +v_sat_pk_u8_i16 v127.l, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_sat_pk_u8_i16_dpp v127.l, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] + +v_sat_pk_u8_i16 v127.l, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sat_pk_u8_i16_dpp v127.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0x01,0x77,0x39,0x05] + +v_sat_pk_u8_i16 v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_sat_pk_u8_i16_dpp v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05] + +v_sat_pk_u8_i16 v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00] v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index 4ae91340386b6..9c5693de3d8b1 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -716,6 +716,24 @@ v_sat_pk_u8_i16_e32 v199, v5 dpp8:[7,6,5,4,3,2,1,0] v_sat_pk_u8_i16_e32 v199, v5 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:30: error: invalid operand for instruction +v_sat_pk_u8_i16_e32 v199.h, v5.h +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.h, v5.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.h, v5.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.l, 
v5.l +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.l, v5.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.l, v5.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + v_sin_f16_e32 v128, 0xfe0b // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index 1d441720280ca..fa6ab407f87c7 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -1802,14 +1802,23 @@ v_rsq_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_rsq_f16 v5, v199 quad_perm:[3,2,1,0] // GFX11: v_rsq_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_sat_pk_u8_i16 v199, v5 -// GFX11: v_sat_pk_u8_i16_e64 v199, v5 ; encoding: [0xc7,0x00,0xe2,0xd5,0x05,0x01,0x00,0x00] +v_sat_pk_u8_i16 v199.h, v5 +// GFX11: v_sat_pk_u8_i16_e64 v199.h, v5 op_sel:[0,1] ; encoding: [0xc7,0x40,0xe2,0xd5,0x05,0x01,0x00,0x00] -v_sat_pk_u8_i16 v199, v5 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sat_pk_u8_i16_e64_dpp v199, v5 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xc7,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x05,0x77,0x39,0x05] +v_sat_pk_u8_i16 v199.h, v5 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xc7,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x05,0x77,0x39,0x05] -v_sat_pk_u8_i16 v199, v5 quad_perm:[3,2,1,0] -// GFX11: v_sat_pk_u8_i16_e64_dpp v199, v5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff] +v_sat_pk_u8_i16 v199.h, v5 quad_perm:[3,2,1,0] +// GFX11: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xc7,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff] + +v_sat_pk_u8_i16 v199.l, v5 +// GFX11: v_sat_pk_u8_i16_e64 v199.l, v5 ; encoding: [0xc7,0x00,0xe2,0xd5,0x05,0x01,0x00,0x00] + +v_sat_pk_u8_i16 v199.l, v5 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sat_pk_u8_i16_e64_dpp v199.l, v5 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xc7,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x05,0x77,0x39,0x05] + +v_sat_pk_u8_i16 v199.l, v5 quad_perm:[3,2,1,0] +// GFX11: v_sat_pk_u8_i16_e64_dpp v199.l, v5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff] v_sin_f16 v128, 0xfe0b // GFX11: v_sin_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index f38ff6a2fdd7d..1bd1a5c5695bc 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -2644,47 +2644,50 @@ v_rsq_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[3,2,1,0] +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[0,1,2,3] +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_mirror +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_half_mirror +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:1 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:15 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:1 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:15 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:1 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:15 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 
bank_mask:0x1 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] -v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] -v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] + +v_sat_pk_u8_i16_e64_dpp v255.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index 95407886ccba1..65af1c1829902 100644 --- 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -760,14 +760,17 @@ v_rsq_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xae,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64_dpp v255.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_sat_pk_u8_i16_e64_dpp v255.l, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +v_sat_pk_u8_i16_e64_dpp v255.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 3850f0254a7f1..1108887c26ed4 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -3190,50 +3190,53 @@ v_rsq_f64_e64 v[5:6], -|src_scc| mul:4 v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 // GFX11: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_sat_pk_u8_i16_e64 v5, v1 -// GFX11: v_sat_pk_u8_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, v1 +// GFX11: v_sat_pk_u8_i16_e64 v5.l, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, v255 -// GFX11: v_sat_pk_u8_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, v255 +// GFX11: v_sat_pk_u8_i16_e64 v5.l, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, s1 -// GFX11: v_sat_pk_u8_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, s1 +// GFX11: v_sat_pk_u8_i16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, s105 -// GFX11: v_sat_pk_u8_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, s105 +// GFX11: v_sat_pk_u8_i16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, vcc_lo -// GFX11: v_sat_pk_u8_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, vcc_lo +// GFX11: v_sat_pk_u8_i16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, vcc_hi -// GFX11: v_sat_pk_u8_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, vcc_hi +// GFX11: v_sat_pk_u8_i16_e64 v5.l, vcc_hi ; encoding: 
[0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, ttmp15 -// GFX11: v_sat_pk_u8_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, ttmp15 +// GFX11: v_sat_pk_u8_i16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, m0 -// GFX11: v_sat_pk_u8_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, m0 +// GFX11: v_sat_pk_u8_i16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, exec_lo -// GFX11: v_sat_pk_u8_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, exec_lo +// GFX11: v_sat_pk_u8_i16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, exec_hi -// GFX11: v_sat_pk_u8_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, exec_hi +// GFX11: v_sat_pk_u8_i16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, null -// GFX11: v_sat_pk_u8_i16_e64 v5, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, null +// GFX11: v_sat_pk_u8_i16_e64 v5.l, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, -1 -// GFX11: v_sat_pk_u8_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, -1 +// GFX11: v_sat_pk_u8_i16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, 0.5 -// GFX11: v_sat_pk_u8_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, 0.5 +// GFX11: v_sat_pk_u8_i16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, src_scc -// GFX11: v_sat_pk_u8_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, src_scc +// GFX11: 
v_sat_pk_u8_i16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v255, 0xfe0b -// GFX11: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_sat_pk_u8_i16_e64 v255.l, 0xfe0b +// GFX11: v_sat_pk_u8_i16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_sat_pk_u8_i16_e64 v255.h, 0xfe0b +// GFX11: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] v_sin_f16_e64 v5, v1 // GFX11: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index edd3b916f4e5f..086356fbca25a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -3301,49 +3301,70 @@ v_rsq_f64 v[254:255], 0xaf123456 // GFX12: v_rsq_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf] v_sat_pk_u8_i16 v5, v1 -// GFX12: v_sat_pk_u8_i16_e32 v5, v1 ; encoding: [0x01,0xc5,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, v1 ; encoding: [0x01,0xc5,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, v1 ; encoding: [0x01,0xc5,0x0a,0x7e] v_sat_pk_u8_i16 v5, v255 -// GFX12: v_sat_pk_u8_i16_e32 v5, v255 ; encoding: [0xff,0xc5,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, v255 ; encoding: [0xff,0xc5,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, v255 ; encoding: [0xff,0xc5,0x0a,0x7e] v_sat_pk_u8_i16 v5, s1 -// GFX12: v_sat_pk_u8_i16_e32 v5, s1 ; encoding: [0x01,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, s1 ; encoding: [0x01,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, s1 ; encoding: [0x01,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, s105 -// GFX12: v_sat_pk_u8_i16_e32 v5, s105 ; encoding: [0x69,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, s105 ; encoding: [0x69,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, s105 ; encoding: [0x69,0xc4,0x0a,0x7e] 
v_sat_pk_u8_i16 v5, vcc_lo -// GFX12: v_sat_pk_u8_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, vcc_hi -// GFX12: v_sat_pk_u8_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, ttmp15 -// GFX12: v_sat_pk_u8_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, m0 -// GFX12: v_sat_pk_u8_i16_e32 v5, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, exec_lo -// GFX12: v_sat_pk_u8_i16_e32 v5, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, exec_hi -// GFX12: v_sat_pk_u8_i16_e32 v5, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, null -// GFX12: v_sat_pk_u8_i16_e32 v5, null ; encoding: [0x7c,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, null ; encoding: [0x7c,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, null ; encoding: [0x7c,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, -1 -// GFX12: v_sat_pk_u8_i16_e32 v5, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, -1 ; encoding: 
[0xc1,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, 0.5 -// GFX12: v_sat_pk_u8_i16_e32 v5, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v5, src_scc -// GFX12: v_sat_pk_u8_i16_e32 v5, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e] v_sat_pk_u8_i16 v127, 0xfe0b -// GFX12: v_sat_pk_u8_i16_e32 v127, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +// GFX12-ASM: v_sat_pk_u8_i16_e32 v127, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +// GFX12-DIS: v_sat_pk_u8_i16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_sat_pk_u8_i16 v5.h, src_scc +// GFX12: v_sat_pk_u8_i16_e32 v5.h, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7f] + +v_sat_pk_u8_i16 v127.h, 0xfe0b +// GFX12: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_sin_f16 v5, v1 // GFX12: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index 56b42f19db38a..26e7162206aed 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -2632,6 +2632,12 @@ v_sat_pk_u8_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_sat_pk_u8_i16 v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x05,0x30] +v_sat_pk_u8_i16 v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_sat_pk_u8_i16_dpp v5.h, v1 row_xmask:0 row_mask:0x1 
bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x09,0x13] + +v_sat_pk_u8_i16 v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x05,0x30] + v_sin_f16 v5, v1 quad_perm:[3,2,1,0] // GFX12: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index 09f3069114d4a..a54ae771fab40 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -619,6 +619,12 @@ v_sat_pk_u8_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sat_pk_u8_i16 v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] +v_sat_pk_u8_i16 v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_sat_pk_u8_i16_dpp v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05] + +v_sat_pk_u8_i16 v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00] + v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s index 0ccad9c673079..01aa7a44bbc23 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s @@ -625,6 +625,24 @@ v_sat_pk_u8_i16_e32 v199, v5 dpp8:[7,6,5,4,3,2,1,0] v_sat_pk_u8_i16_e32 v199, v5 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:30: error: invalid operand for instruction +v_sat_pk_u8_i16_e32 v199.h, v5 +// GFX12: :[[@LINE-1]]:21: error: invalid operand for 
instruction + +v_sat_pk_u8_i16_e32 v199.h, v5 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.h, v5 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.l, v5 +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.l, v5 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sat_pk_u8_i16_e32 v199.l, v5 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + v_sin_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s index f220ec2b7d1e5..4c983af094561 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s @@ -1771,6 +1771,15 @@ v_sat_pk_u8_i16 v199, v5 dpp8:[7,6,5,4,3,2,1,0] v_sat_pk_u8_i16 v199, v5 quad_perm:[3,2,1,0] // GFX12: v_sat_pk_u8_i16_e64_dpp v199, v5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff] +v_sat_pk_u8_i16 v199.h, v5 +// GFX12: v_sat_pk_u8_i16_e64 v199.h, v5 op_sel:[0,1] ; encoding: [0xc7,0x40,0xe2,0xd5,0x05,0x01,0x00,0x00] + +v_sat_pk_u8_i16 v199.h, v5 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xc7,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x05,0x77,0x39,0x05] + +v_sat_pk_u8_i16 v199.h, v5 quad_perm:[3,2,1,0] +// GFX12: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff] + v_sin_f16 v128, 0xfe0b // GFX12: v_sin_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git 
a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index 015619d31504b..ea4a58d9d0f7e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -3385,6 +3385,9 @@ v_sat_pk_u8_i16_e64 v5, src_scc v_sat_pk_u8_i16_e64 v255, 0xfe0b // GFX12: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_sat_pk_u8_i16_e64 v255.h, 0xfe0b +// GFX12: v_sat_pk_u8_i16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + v_sin_f16_e64 v5, v1 // GFX12: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 160bc3fc6afc7..a9b933e639abb 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -2548,6 +2548,9 @@ v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +v_sat_pk_u8_i16_e64_dpp v255.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] + v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index 
c9ea7cdf1512e..af335f2e0b586 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -730,6 +730,9 @@ v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64_dpp v255.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt index 61e529abf4455..f02f0206acd2f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt @@ -3220,49 +3220,74 @@ # GFX11: v_rsq_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf] 0x01,0xc5,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, v1 ; encoding: [0x01,0xc5,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, v1 ; encoding: [0x01,0xc5,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, v1 ; encoding: [0x01,0xc5,0x0a,0x7e] 0xff,0xc5,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, v255 ; encoding: [0xff,0xc5,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, v255 ; encoding: [0xff,0xc5,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, v255 ; encoding: [0xff,0xc5,0x0a,0x7e] 0x01,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, s1 ; encoding: [0x01,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, s1 ; encoding: [0x01,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, s1 ; encoding: [0x01,0xc4,0x0a,0x7e] 
0x69,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, s105 ; encoding: [0x69,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, s105 ; encoding: [0x69,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, s105 ; encoding: [0x69,0xc4,0x0a,0x7e] 0x6a,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e] 0x6b,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e] 0x7b,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e] 0x7d,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e] 0x7e,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e] 0x7f,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e] 0x7c,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, null ; encoding: [0x7c,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, null ; encoding: [0x7c,0xc4,0x0a,0x7e] +# 
GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, null ; encoding: [0x7c,0xc4,0x0a,0x7e] 0xc1,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e] 0xf0,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e] 0xfd,0xc4,0x0a,0x7e -# GFX11: v_sat_pk_u8_i16_e32 v5, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e] 0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e32 v127, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v127, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +0xf0,0xc4,0xfe,0x7e +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc4,0xfe,0x7e] +# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v127, 0.5 ; encoding: [0xf0,0xc4,0xfe,0x7e] + +0xfd,0xc4,0x0a,0x7f +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.h, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7f] + +0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] 0x01,0xc1,0x0a,0x7e # GFX11: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index 1075a3eecd540..a4491e02abf05 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ 
b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -2525,46 +2525,72 @@ # GFX11: v_rsq_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x5c,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp 
v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30 -# GFX11: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30] + +0xfa,0xc4,0xfe,0x7e,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v127, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_mul_i32_i24_e32 v128, s1, v176 ; encoding: [0x01,0x60,0x01,0x13] + +0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_lshlrev_b32_e32 v6, v255, v183 ; encoding: [0xff,0x6f,0x0d,0x30] 0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX11: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index 051dd348e9a38..4e15731203168 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -470,10 +470,23 @@ # GFX11: 
v_rsq_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x5c,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] + +0xe9,0xc4,0xfe,0x7e,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v127, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v1, v187 ; encoding: [0x01,0x77,0x39,0x05] + +0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX11: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index 2666b758344c6..f97c678e6a90a 100644 --- 
a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -2687,46 +2687,64 @@ # GFX11: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] 0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; 
encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] + +0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index c19947c4bd6ff..3cad28d888202 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -727,10 +727,16 @@ # GFX11: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v255.l, 
v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +0xff,0x40,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX11: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index 3df206ccf522e..8b2bc97c5de1f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -3204,49 +3204,68 @@ # GFX11: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] 0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] 
0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, m0 ; encoding: 
[0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] +# GFX11-FAKE16: 
v_sat_pk_u8_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] 0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_sat_pk_u8_i16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00 # GFX11: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index a1291b2e34f34..aa60378da9ab0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -2661,46 +2661,68 @@ # GFX12: v_rsq_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x5c,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: 
v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30 -# GFX12: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30] + +0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x01,0x13] +# 
GFX12-FAKE16: v_mul_i32_i24_e32 v128, s1, v176 ; encoding: [0x01,0x60,0x01,0x13] + +0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_lshlrev_b32_e32 v6, v255, v183 ; encoding: [0xff,0x6f,0x0d,0x30] 0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX12: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 05008bfabc45a..99985e09d7432 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -483,10 +483,19 @@ # GFX12: v_rsq_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x5c,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00] + +0xe9,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_add_f64_e32 
v[156:157], v[1:2], v[187:188] ; encoding: [0x01,0x77,0x39,0x05] + +0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index bb9f607b6ece6..8ba4f58b787f5 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -3250,49 +3250,68 @@ # GFX12: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] 0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] +# 
GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, exec_hi ; encoding: 
[0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] 0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] 0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: 
[0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_sat_pk_u8_i16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index be9f069322da8..98da7c8c54508 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -2531,46 +2531,64 @@ # GFX12: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 
0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# 
GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] 0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] + +0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 87115b962a808..8213237ada1e2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -691,10 +691,16 @@ # GFX12: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 
0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v255.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +0xff,0x40,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From e5acb167b72a6d2a6e29bcd29d6be57e15224c24 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 11:43:45 -0500 Subject: [PATCH 092/480] [AMDGPU][True16][MC] true16 for v_trunc_f16 (#120693) Support true16 format for v_trunc_f16 in MC --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll | 41 +++++ llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 75 +++++---- 
llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 65 ++++---- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 21 ++- llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s | 44 ++++- .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s | 155 ++++++++++++----- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 65 ++++---- .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 25 ++- .../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s | 69 ++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 72 ++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 62 +++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 18 +- llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s | 43 +++++ .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s | 156 +++++++++++++----- .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 69 ++++---- .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 65 ++++---- .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 25 ++- .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 63 +++++-- .../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 54 ++++-- .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 17 +- .../gfx11_dasm_vop3_dpp16_from_vop1.txt | 54 ++++-- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 24 ++- .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt | 57 +++++-- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 50 ++++-- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 13 +- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 57 +++++-- .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 54 ++++-- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 24 ++- 29 files changed, 1074 insertions(+), 465 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index bbb456ab739ab..92ebd0e10c8fd 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1042,7 +1042,7 @@ defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f1 defm V_FLOOR_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">; defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">; defm V_CEIL_F16_fake16 : 
VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">; -defm V_TRUNC_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05d, "v_trunc_f16">; +defm V_TRUNC_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05d, "v_trunc_f16">; defm V_RNDNE_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">; defm V_FRACT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05f, "v_fract_f16">; defm V_SIN_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x060, "v_sin_f16">; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll index 47777e3853e89..0d58afd1812de 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -2,6 +2,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s declare half @llvm.trunc.f16(half %a) declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a) @@ -62,6 +63,24 @@ define amdgpu_kernel void @trunc_f16( ; GFX11-NEXT: v_trunc_f16_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: trunc_f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: 
buffer_load_u16 v0, off, s[8:11], null +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_trunc_f16_e32 v0, v0 +; GFX12-NEXT: buffer_store_b16 v0, off, s[4:7], null +; GFX12-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -147,6 +166,28 @@ define amdgpu_kernel void @trunc_v2f16( ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: trunc_v2f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-NEXT: v_trunc_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_trunc_f16_e32 v1, v1 +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index 4448720e6f79f..fe08042ae5c84 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -3533,50 +3533,65 @@ v_swaprel_b32 v5, v1 v_swaprel_b32 v255, v255 // GFX11: v_swaprel_b32 v255, v255 ; encoding: [0xff,0xd1,0xfe,0x7f] -v_trunc_f16 v5, v1 -// GFX11: v_trunc_f16_e32 v5, v1 ; encoding: [0x01,0xbb,0x0a,0x7e] +v_trunc_f16 v5.l, v1.l +// GFX11: v_trunc_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbb,0x0a,0x7e] -v_trunc_f16 v5, v127 -// GFX11: v_trunc_f16_e32 v5, v127 ; encoding: [0x7f,0xbb,0x0a,0x7e] +v_trunc_f16 v5.l, v127.l +// GFX11: 
v_trunc_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbb,0x0a,0x7e] -v_trunc_f16 v5, s1 -// GFX11: v_trunc_f16_e32 v5, s1 ; encoding: [0x01,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, s1 +// GFX11: v_trunc_f16_e32 v5.l, s1 ; encoding: [0x01,0xba,0x0a,0x7e] -v_trunc_f16 v5, s105 -// GFX11: v_trunc_f16_e32 v5, s105 ; encoding: [0x69,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, s105 +// GFX11: v_trunc_f16_e32 v5.l, s105 ; encoding: [0x69,0xba,0x0a,0x7e] -v_trunc_f16 v5, vcc_lo -// GFX11: v_trunc_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, vcc_lo +// GFX11: v_trunc_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e] -v_trunc_f16 v5, vcc_hi -// GFX11: v_trunc_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, vcc_hi +// GFX11: v_trunc_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e] -v_trunc_f16 v5, ttmp15 -// GFX11: v_trunc_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, ttmp15 +// GFX11: v_trunc_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e] -v_trunc_f16 v5, m0 -// GFX11: v_trunc_f16_e32 v5, m0 ; encoding: [0x7d,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, m0 +// GFX11: v_trunc_f16_e32 v5.l, m0 ; encoding: [0x7d,0xba,0x0a,0x7e] -v_trunc_f16 v5, exec_lo -// GFX11: v_trunc_f16_e32 v5, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, exec_lo +// GFX11: v_trunc_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e] -v_trunc_f16 v5, exec_hi -// GFX11: v_trunc_f16_e32 v5, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, exec_hi +// GFX11: v_trunc_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e] -v_trunc_f16 v5, null -// GFX11: v_trunc_f16_e32 v5, null ; encoding: [0x7c,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, null +// GFX11: v_trunc_f16_e32 v5.l, null ; encoding: [0x7c,0xba,0x0a,0x7e] -v_trunc_f16 v5, -1 -// GFX11: v_trunc_f16_e32 v5, -1 ; encoding: [0xc1,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, -1 +// GFX11: v_trunc_f16_e32 v5.l, -1 ; encoding: [0xc1,0xba,0x0a,0x7e] -v_trunc_f16 v5, 0.5 -// GFX11: 
v_trunc_f16_e32 v5, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, 0.5 +// GFX11: v_trunc_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e] -v_trunc_f16 v5, src_scc -// GFX11: v_trunc_f16_e32 v5, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, src_scc +// GFX11: v_trunc_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e] -v_trunc_f16 v127, 0xfe0b -// GFX11: v_trunc_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_trunc_f16 v127.l, 0xfe0b +// GFX11: v_trunc_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_trunc_f16 v5.l, v1.h +// GFX11: v_trunc_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbb,0x0a,0x7e] + +v_trunc_f16 v5.l, v127.h +// GFX11: v_trunc_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbb,0x0a,0x7e] + +v_trunc_f16 v127.l, 0.5 +// GFX11: v_trunc_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xba,0xfe,0x7e] + +v_trunc_f16 v5.h, src_scc +// GFX11: v_trunc_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xba,0x0a,0x7f] + +v_trunc_f16 v127.h, 0xfe0b +// GFX11: v_trunc_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_trunc_f32 v5, v1 // GFX11: v_trunc_f32_e32 v5, v1 ; encoding: [0x01,0x43,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index da2a3615360a4..f5cf3fd390c7d 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ -2741,47 +2741,56 @@ v_sqrt_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_sqrt_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_sqrt_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_trunc_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_trunc_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// 
GFX11: v_trunc_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_trunc_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_trunc_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_trunc_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_trunc_f16 v5, v1 row_mirror -// GFX11: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_mirror +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_trunc_f16 v5, v1 row_half_mirror -// GFX11: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_half_mirror +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_trunc_f16 v5, v1 row_shl:1 -// GFX11: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_shl:1 +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_trunc_f16 v5, v1 row_shl:15 -// GFX11: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_shl:15 +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_trunc_f16 v5, v1 row_shr:1 -// GFX11: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_shr:1 +// GFX11: 
v_trunc_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_trunc_f16 v5, v1 row_shr:15 -// GFX11: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_shr:15 +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_trunc_f16 v5, v1 row_ror:1 -// GFX11: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_ror:1 +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_trunc_f16 v5, v1 row_ror:15 -// GFX11: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_ror:15 +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_trunc_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_trunc_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_trunc_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_trunc_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 
bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_trunc_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_trunc_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_trunc_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_trunc_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_trunc_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_trunc_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_trunc_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +v_trunc_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_trunc_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_trunc_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_trunc_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_trunc_f32 v5, v1 quad_perm:[3,2,1,0] // GFX11: v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index 34cb2d097b7a7..5a0ffd04bc5c1 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -644,14 +644,23 @@ v_sqrt_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sqrt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_sqrt_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xe9,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_trunc_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_trunc_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_trunc_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_trunc_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_trunc_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_trunc_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +v_trunc_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_trunc_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_trunc_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_trunc_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00] v_trunc_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_trunc_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index 9c5693de3d8b1..92882cb89e201 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -1,4 +1,4 @@ -; NOTE: Assertions have 
been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s @@ -812,6 +812,12 @@ v_swap_b16_e32 v128.l, v0.l v_trunc_f16_e32 v128, 0xfe0b // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_trunc_f16_e32 v128.h, 0xfe0b +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v128.l, 0xfe0b +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + v_trunc_f16_e32 v255, v1 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -821,6 +827,24 @@ v_trunc_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_trunc_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction +v_trunc_f16_e32 v255.h, v1.h +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.l, v1.l +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + v_trunc_f16_e32 v5, v199 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -829,3 +853,21 @@ v_trunc_f16_e32 v5, v199 
dpp8:[7,6,5,4,3,2,1,0] v_trunc_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_trunc_f16_e32 v5.h, v199.h +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.l, v199.l +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index fa6ab407f87c7..d97c8ed844dbb 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s @@ -1952,69 +1952,134 @@ v_sqrt_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_sqrt_f16 v5, v199 quad_perm:[3,2,1,0] // GFX11: v_sqrt_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_trunc_f16 v128, 0xfe0b -// GFX11: v_trunc_f16_e64 v128, 0xfe0b ; encoding: 
[0x80,0x00,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_trunc_f16 v128.h, 0xfe0b +// GFX11: v_trunc_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_trunc_f16 v255, -1 -// GFX11: v_trunc_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +v_trunc_f16 v128.l, 0xfe0b +// GFX11: v_trunc_f16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_trunc_f16 v255, 0.5 -// GFX11: v_trunc_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x00] +v_trunc_f16 v255.h, -1 +// GFX11: v_trunc_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xc1,0x00,0x00,0x00] -v_trunc_f16 v255, exec_hi -// GFX11: v_trunc_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +v_trunc_f16 v255.h, 0.5 +// GFX11: v_trunc_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xf0,0x00,0x00,0x00] -v_trunc_f16 v255, exec_lo -// GFX11: v_trunc_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +v_trunc_f16 v255.h, exec_hi +// GFX11: v_trunc_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7f,0x00,0x00,0x00] -v_trunc_f16 v255, m0 -// GFX11: v_trunc_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +v_trunc_f16 v255.h, exec_lo +// GFX11: v_trunc_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7e,0x00,0x00,0x00] -v_trunc_f16 v255, null -// GFX11: v_trunc_f16_e64 v255, null ; encoding: [0xff,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +v_trunc_f16 v255.h, m0 +// GFX11: v_trunc_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7d,0x00,0x00,0x00] -v_trunc_f16 v255, s1 -// GFX11: v_trunc_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +v_trunc_f16 v255.h, null +// GFX11: v_trunc_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7c,0x00,0x00,0x00] -v_trunc_f16 v255, s105 
-// GFX11: v_trunc_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +v_trunc_f16 v255.h, s1 +// GFX11: v_trunc_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x01,0x00,0x00,0x00] -v_trunc_f16 v255, src_scc -// GFX11: v_trunc_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x00] +v_trunc_f16 v255.h, s105 +// GFX11: v_trunc_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x69,0x00,0x00,0x00] -v_trunc_f16 v255, ttmp15 -// GFX11: v_trunc_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +v_trunc_f16 v255.h, src_scc +// GFX11: v_trunc_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xfd,0x00,0x00,0x00] -v_trunc_f16 v255, v1 -// GFX11: v_trunc_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +v_trunc_f16 v255.h, ttmp15 +// GFX11: v_trunc_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7b,0x00,0x00,0x00] -v_trunc_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_trunc_f16 v255.h, v1.h +// GFX11: v_trunc_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00] -v_trunc_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_trunc_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_trunc_f16 v255, v127 -// GFX11: v_trunc_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x01,0x00,0x00] +v_trunc_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xff,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_trunc_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_trunc_f16 v255.h, v127.h +// GFX11: v_trunc_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdd,0xd5,0x7f,0x01,0x00,0x00] -v_trunc_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_trunc_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_trunc_f16 v255, vcc_hi -// GFX11: v_trunc_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +v_trunc_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_trunc_f16 v255, vcc_lo -// GFX11: v_trunc_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +v_trunc_f16 v255.h, vcc_hi +// GFX11: v_trunc_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x6b,0x00,0x00,0x00] -v_trunc_f16 v5, v199 -// GFX11: v_trunc_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xdd,0xd5,0xc7,0x01,0x00,0x00] +v_trunc_f16 v255.h, vcc_lo +// GFX11: v_trunc_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x6a,0x00,0x00,0x00] -v_trunc_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_trunc_f16 v255.l, -1 +// GFX11: v_trunc_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +v_trunc_f16 v255.l, 0.5 +// GFX11: 
v_trunc_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x00] -v_trunc_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_trunc_f16 v255.l, exec_hi +// GFX11: v_trunc_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] + +v_trunc_f16 v255.l, exec_lo +// GFX11: v_trunc_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] + +v_trunc_f16 v255.l, m0 +// GFX11: v_trunc_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] + +v_trunc_f16 v255.l, null +// GFX11: v_trunc_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] + +v_trunc_f16 v255.l, s1 +// GFX11: v_trunc_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] + +v_trunc_f16 v255.l, s105 +// GFX11: v_trunc_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] + +v_trunc_f16 v255.l, src_scc +// GFX11: v_trunc_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x00] + +v_trunc_f16 v255.l, ttmp15 +// GFX11: v_trunc_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] + +v_trunc_f16 v255.l, v1.l +// GFX11: v_trunc_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] + +v_trunc_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_trunc_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_trunc_f16 v255.l, v127.l +// GFX11: v_trunc_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x01,0x00,0x00] + +v_trunc_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: 
v_trunc_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_trunc_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_trunc_f16 v255.l, vcc_hi +// GFX11: v_trunc_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] + +v_trunc_f16 v255.l, vcc_lo +// GFX11: v_trunc_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] + +v_trunc_f16 v5.h, v199.h +// GFX11: v_trunc_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0xc7,0x01,0x00,0x00] + +v_trunc_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_trunc_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_trunc_f16 v5.l, v199.l +// GFX11: v_trunc_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xdd,0xd5,0xc7,0x01,0x00,0x00] + +v_trunc_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_trunc_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index 1bd1a5c5695bc..6176baf11c552 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -2866,47 +2866,56 @@ 
v_sqrt_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_trunc_f16_e64_dpp v5, v1 row_mirror -// GFX11: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_mirror +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_half_mirror -// GFX11: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 
-v_trunc_f16_e64_dpp v5, v1 row_shl:1 -// GFX11: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_shl:15 -// GFX11: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_shr:1 -// GFX11: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_shr:15 -// GFX11: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_ror:1 -// GFX11: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_ror:15 -// GFX11: v_trunc_f16_e64_dpp v5, v1 
row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_trunc_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_trunc_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_trunc_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index 65af1c1829902..f3c8c8a69fbe5 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -829,17 +829,26 @@ v_sqrt_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xb3,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_trunc_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_trunc_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x08,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_trunc_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0xc1,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 1108887c26ed4..9020017c86106 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -3454,50 +3454,59 @@ v_sqrt_f64_e64 v[5:6], -|src_scc| mul:4 v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 // GFX11: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_trunc_f16_e64 v5, v1 -// GFX11: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +v_trunc_f16_e64 v5.l, v1.l +// GFX11: v_trunc_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] -v_trunc_f16_e64 v5, v255 -// GFX11: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] +v_trunc_f16_e64 v5.l, v255.l +// GFX11: v_trunc_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] -v_trunc_f16_e64 v5, s1 -// GFX11: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, s1 +// GFX11: v_trunc_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] -v_trunc_f16_e64 v5, s105 -// GFX11: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, s105 +// GFX11: v_trunc_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] -v_trunc_f16_e64 v5, vcc_lo -// GFX11: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, vcc_lo +// GFX11: v_trunc_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] -v_trunc_f16_e64 v5, vcc_hi -// GFX11: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, vcc_hi +// GFX11: v_trunc_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] -v_trunc_f16_e64 v5, ttmp15 -// GFX11: v_trunc_f16_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, ttmp15 +// GFX11: v_trunc_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] -v_trunc_f16_e64 v5, m0 -// GFX11: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, m0 +// GFX11: v_trunc_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] -v_trunc_f16_e64 v5, exec_lo -// GFX11: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, exec_lo +// GFX11: v_trunc_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] -v_trunc_f16_e64 v5, exec_hi -// GFX11: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, exec_hi +// GFX11: v_trunc_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] -v_trunc_f16_e64 v5, null -// GFX11: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, null +// GFX11: v_trunc_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] -v_trunc_f16_e64 v5, -1 -// GFX11: v_trunc_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, -1 +// GFX11: v_trunc_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] -v_trunc_f16_e64 v5, 0.5 mul:2 -// GFX11: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] +v_trunc_f16_e64 v5.l, 0.5 mul:2 +// GFX11: v_trunc_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] -v_trunc_f16_e64 v5, src_scc mul:4 -// GFX11: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] +v_trunc_f16_e64 v5.l, src_scc mul:4 +// GFX11: v_trunc_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] -v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX11: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: 
[0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX11: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_trunc_f16_e64 v5.h, v1.h +// GFX11: [0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00] + +v_trunc_f16_e64 v5.l, v255.h +// GFX11: [0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00] + +v_trunc_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX11: [0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_trunc_f32_e64 v5, v1 // GFX11: v_trunc_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index 086356fbca25a..b125821d1306e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -3603,50 +3603,62 @@ v_swaprel_b32 v5, v1 v_swaprel_b32 v255, v255 // GFX12: v_swaprel_b32 v255, v255 ; encoding: [0xff,0xd1,0xfe,0x7f] -v_trunc_f16 v5, v1 -// GFX12: v_trunc_f16_e32 v5, v1 ; encoding: [0x01,0xbb,0x0a,0x7e] +v_trunc_f16 v5.l, v1.l +// GFX12: v_trunc_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbb,0x0a,0x7e] -v_trunc_f16 v5, v127 -// GFX12: v_trunc_f16_e32 v5, v127 ; encoding: [0x7f,0xbb,0x0a,0x7e] +v_trunc_f16 v5.l, v127.l +// GFX12: v_trunc_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbb,0x0a,0x7e] -v_trunc_f16 v5, s1 -// GFX12: v_trunc_f16_e32 v5, s1 ; encoding: [0x01,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, s1 +// GFX12: v_trunc_f16_e32 v5.l, s1 ; encoding: [0x01,0xba,0x0a,0x7e] -v_trunc_f16 v5, s105 -// GFX12: v_trunc_f16_e32 v5, s105 ; encoding: [0x69,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, s105 +// GFX12: v_trunc_f16_e32 v5.l, s105 ; encoding: [0x69,0xba,0x0a,0x7e] -v_trunc_f16 v5, vcc_lo -// GFX12: v_trunc_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, vcc_lo +// GFX12: v_trunc_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e] -v_trunc_f16 v5, vcc_hi -// GFX12: v_trunc_f16_e32 v5, vcc_hi ; 
encoding: [0x6b,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, vcc_hi +// GFX12: v_trunc_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e] -v_trunc_f16 v5, ttmp15 -// GFX12: v_trunc_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, ttmp15 +// GFX12: v_trunc_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e] -v_trunc_f16 v5, m0 -// GFX12: v_trunc_f16_e32 v5, m0 ; encoding: [0x7d,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, m0 +// GFX12: v_trunc_f16_e32 v5.l, m0 ; encoding: [0x7d,0xba,0x0a,0x7e] -v_trunc_f16 v5, exec_lo -// GFX12: v_trunc_f16_e32 v5, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, exec_lo +// GFX12: v_trunc_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e] -v_trunc_f16 v5, exec_hi -// GFX12: v_trunc_f16_e32 v5, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, exec_hi +// GFX12: v_trunc_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e] -v_trunc_f16 v5, null -// GFX12: v_trunc_f16_e32 v5, null ; encoding: [0x7c,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, null +// GFX12: v_trunc_f16_e32 v5.l, null ; encoding: [0x7c,0xba,0x0a,0x7e] -v_trunc_f16 v5, -1 -// GFX12: v_trunc_f16_e32 v5, -1 ; encoding: [0xc1,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, -1 +// GFX12: v_trunc_f16_e32 v5.l, -1 ; encoding: [0xc1,0xba,0x0a,0x7e] -v_trunc_f16 v5, 0.5 -// GFX12: v_trunc_f16_e32 v5, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, 0.5 +// GFX12: v_trunc_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e] -v_trunc_f16 v5, src_scc -// GFX12: v_trunc_f16_e32 v5, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e] +v_trunc_f16 v5.l, src_scc +// GFX12: v_trunc_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e] -v_trunc_f16 v127, 0xfe0b -// GFX12: v_trunc_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_trunc_f16 v127.l, 0xfe0b +// GFX12: v_trunc_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_trunc_f16 v5.l, v1.h +// GFX12: v_trunc_f16_e32 v5.l, v1.h ; encoding: 
[0x81,0xbb,0x0a,0x7e] + +v_trunc_f16 v5.l, v127.h +// GFX12: v_trunc_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbb,0x0a,0x7e] + +v_trunc_f16 v5.h, src_scc +// GFX12: v_trunc_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xba,0x0a,0x7f] + +v_trunc_f16 v127.h, 0xfe0b +// GFX12: v_trunc_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_trunc_f32 v5, v1 // GFX12: v_trunc_f32_e32 v5, v1 ; encoding: [0x01,0x43,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index 26e7162206aed..a625326c1dae4 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -2806,47 +2806,53 @@ v_sqrt_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_sqrt_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_sqrt_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_trunc_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_trunc_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_trunc_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_trunc_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_trunc_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_trunc_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_trunc_f16 v5, v1 row_mirror -// GFX12: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_mirror +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_trunc_f16 v5, v1 row_half_mirror -// GFX12: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_half_mirror +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_trunc_f16 v5, v1 row_shl:1 -// GFX12: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_shl:1 +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_trunc_f16 v5, v1 row_shl:15 -// GFX12: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_shl:15 +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_trunc_f16 v5, v1 row_shr:1 -// GFX12: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_shr:1 +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_trunc_f16 v5, v1 row_shr:15 -// GFX12: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_shr:15 +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_trunc_f16 v5, v1 row_ror:1 -// GFX12: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_ror:1 +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_trunc_f16 v5, 
v1 row_ror:15 -// GFX12: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_ror:15 +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_trunc_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_trunc_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_trunc_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_trunc_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_trunc_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_trunc_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_trunc_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_trunc_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_trunc_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_trunc_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_trunc_f16 v5.h, v1.h 
row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_trunc_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_trunc_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_trunc_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_trunc_f32 v5, v1 quad_perm:[3,2,1,0] // GFX12: v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index a54ae771fab40..9281d6fb16ce8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -661,14 +661,20 @@ v_sqrt_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sqrt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_sqrt_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_trunc_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_trunc_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_trunc_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_trunc_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_trunc_f16_dpp v127.l, v127.l 
dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_trunc_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_trunc_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_trunc_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_trunc_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00] v_trunc_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_trunc_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s index 01aa7a44bbc23..33a5dded095c7 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s @@ -1,3 +1,4 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 ; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=error %s @@ -703,6 +704,12 @@ v_swap_b16_e32 v128.l, v0.l v_trunc_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_trunc_f16_e32 v128.h, 0xfe0b +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v128.l, 0xfe0b +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + v_trunc_f16_e32 v255, v1 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -712,6 +719,24 @@ v_trunc_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_trunc_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction +v_trunc_f16_e32 v255.h, v1.h +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// 
GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.l, v1.l +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_trunc_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + v_trunc_f16_e32 v5, v199 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -720,3 +745,21 @@ v_trunc_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_trunc_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_trunc_f16_e32 v5.h, v199.h +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.l, v199.l +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_trunc_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s index 4c983af094561..03519d43c49a9 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 // RUN: llvm-mc -triple=amdgcn 
-mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX12 --implicit-check-not=_e32 %s v_ceil_f16 v128, 0xfe0b @@ -1912,68 +1912,134 @@ v_sqrt_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_sqrt_f16 v5, v199 quad_perm:[3,2,1,0] // GFX12: v_sqrt_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_trunc_f16 v128, 0xfe0b -// GFX12: v_trunc_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_trunc_f16 v128.h, 0xfe0b +// GFX12: v_trunc_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_trunc_f16 v255, -1 -// GFX12: v_trunc_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +v_trunc_f16 v128.l, 0xfe0b +// GFX12: v_trunc_f16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_trunc_f16 v255, 0.5 -// GFX12: v_trunc_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x00] +v_trunc_f16 v255.h, -1 +// GFX12: v_trunc_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xc1,0x00,0x00,0x00] -v_trunc_f16 v255, exec_hi -// GFX12: v_trunc_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +v_trunc_f16 v255.h, 0.5 +// GFX12: v_trunc_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xf0,0x00,0x00,0x00] -v_trunc_f16 v255, exec_lo -// GFX12: v_trunc_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +v_trunc_f16 v255.h, exec_hi +// GFX12: v_trunc_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7f,0x00,0x00,0x00] -v_trunc_f16 v255, m0 -// GFX12: v_trunc_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +v_trunc_f16 v255.h, exec_lo +// GFX12: v_trunc_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7e,0x00,0x00,0x00] -v_trunc_f16 v255, null -// 
GFX12: v_trunc_f16_e64 v255, null ; encoding: [0xff,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +v_trunc_f16 v255.h, m0 +// GFX12: v_trunc_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7d,0x00,0x00,0x00] -v_trunc_f16 v255, s1 -// GFX12: v_trunc_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +v_trunc_f16 v255.h, null +// GFX12: v_trunc_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7c,0x00,0x00,0x00] -v_trunc_f16 v255, s105 -// GFX12: v_trunc_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +v_trunc_f16 v255.h, s1 +// GFX12: v_trunc_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x01,0x00,0x00,0x00] -v_trunc_f16 v255, src_scc -// GFX12: v_trunc_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x00] +v_trunc_f16 v255.h, s105 +// GFX12: v_trunc_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x69,0x00,0x00,0x00] -v_trunc_f16 v255, ttmp15 -// GFX12: v_trunc_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +v_trunc_f16 v255.h, src_scc +// GFX12: v_trunc_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xfd,0x00,0x00,0x00] -v_trunc_f16 v255, v1 -// GFX12: v_trunc_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +v_trunc_f16 v255.h, ttmp15 +// GFX12: v_trunc_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7b,0x00,0x00,0x00] -v_trunc_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_trunc_f16 v255.h, v1.h +// GFX12: v_trunc_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00] -v_trunc_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 
+v_trunc_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_trunc_f16 v255, v127 -// GFX12: v_trunc_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x01,0x00,0x00] +v_trunc_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_trunc_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_trunc_f16 v255.h, v127.h +// GFX12: v_trunc_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdd,0xd5,0x7f,0x01,0x00,0x00] -v_trunc_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_trunc_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_trunc_f16 v255, vcc_hi -// GFX12: v_trunc_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +v_trunc_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_trunc_f16 v255, vcc_lo -// GFX12: v_trunc_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +v_trunc_f16 v255.h, vcc_hi +// GFX12: v_trunc_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x6b,0x00,0x00,0x00] -v_trunc_f16 v5, v199 -// GFX12: v_trunc_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xdd,0xd5,0xc7,0x01,0x00,0x00] 
+v_trunc_f16 v255.h, vcc_lo +// GFX12: v_trunc_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x6a,0x00,0x00,0x00] -v_trunc_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_trunc_f16 v255.l, -1 +// GFX12: v_trunc_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] -v_trunc_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_trunc_f16 v255.l, 0.5 +// GFX12: v_trunc_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x00] + +v_trunc_f16 v255.l, exec_hi +// GFX12: v_trunc_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] + +v_trunc_f16 v255.l, exec_lo +// GFX12: v_trunc_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] + +v_trunc_f16 v255.l, m0 +// GFX12: v_trunc_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] + +v_trunc_f16 v255.l, null +// GFX12: v_trunc_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] + +v_trunc_f16 v255.l, s1 +// GFX12: v_trunc_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] + +v_trunc_f16 v255.l, s105 +// GFX12: v_trunc_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] + +v_trunc_f16 v255.l, src_scc +// GFX12: v_trunc_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x00] + +v_trunc_f16 v255.l, ttmp15 +// GFX12: v_trunc_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] + +v_trunc_f16 v255.l, v1.l +// GFX12: v_trunc_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] + +v_trunc_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_trunc_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_trunc_f16 v255.l, v127.l +// GFX12: v_trunc_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x01,0x00,0x00] + +v_trunc_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_trunc_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_trunc_f16 v255.l, vcc_hi +// GFX12: v_trunc_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] + +v_trunc_f16 v255.l, vcc_lo +// GFX12: v_trunc_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] + +v_trunc_f16 v5.h, v199.h +// GFX12: v_trunc_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0xc7,0x01,0x00,0x00] + +v_trunc_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_trunc_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_trunc_f16 v5.l, v199.l +// GFX12: v_trunc_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xdd,0xd5,0xc7,0x01,0x00,0x00] + +v_trunc_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_trunc_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// 
GFX12: v_trunc_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index ea4a58d9d0f7e..e2fe08ddc8b06 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -3604,50 +3604,59 @@ v_sqrt_f64_e64 v[5:6], -|src_scc| mul:4 v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 // GFX12: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_trunc_f16_e64 v5, v1 -// GFX12: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +v_trunc_f16_e64 v5.l, v1.l +// GFX12: v_trunc_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] -v_trunc_f16_e64 v5, v255 -// GFX12: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] +v_trunc_f16_e64 v5.l, v255.l +// GFX12: v_trunc_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] -v_trunc_f16_e64 v5, s1 -// GFX12: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, s1 +// GFX12: v_trunc_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] -v_trunc_f16_e64 v5, s105 -// GFX12: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, s105 +// GFX12: v_trunc_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] -v_trunc_f16_e64 v5, vcc_lo -// GFX12: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, vcc_lo +// GFX12: v_trunc_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] -v_trunc_f16_e64 v5, vcc_hi -// GFX12: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, vcc_hi +// 
GFX12: v_trunc_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] -v_trunc_f16_e64 v5, ttmp15 -// GFX12: v_trunc_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, ttmp15 +// GFX12: v_trunc_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] -v_trunc_f16_e64 v5, m0 -// GFX12: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, m0 +// GFX12: v_trunc_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] -v_trunc_f16_e64 v5, exec_lo -// GFX12: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, exec_lo +// GFX12: v_trunc_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] -v_trunc_f16_e64 v5, exec_hi -// GFX12: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, exec_hi +// GFX12: v_trunc_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] -v_trunc_f16_e64 v5, null -// GFX12: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, null +// GFX12: v_trunc_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] -v_trunc_f16_e64 v5, -1 -// GFX12: v_trunc_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, -1 +// GFX12: v_trunc_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] -v_trunc_f16_e64 v5, 0.5 mul:2 -// GFX12: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] +v_trunc_f16_e64 v5.l, 0.5 mul:2 +// GFX12: v_trunc_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] -v_trunc_f16_e64 v5, src_scc mul:4 -// GFX12: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] +v_trunc_f16_e64 v5.l, src_scc mul:4 +// GFX12: v_trunc_f16_e64 v5.l, src_scc mul:4 ; 
encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] -v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX12: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX12: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_trunc_f16_e64 v5.h, v1.h +// GFX12: v_trunc_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00] + +v_trunc_f16_e64 v5.l, v255.h +// GFX12: v_trunc_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00] + +v_trunc_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX12: v_trunc_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_trunc_f32_e64 v5, v1 // GFX12: v_trunc_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index a9b933e639abb..3fff2749e6e99 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -2719,47 +2719,56 @@ v_sqrt_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_trunc_f16_e64_dpp v5, v1 row_mirror -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_mirror +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_half_mirror -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_shl:1 -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_shl:15 -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_shr:1 -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_shr:15 -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_ror:1 -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_ror:15 -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_trunc_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_trunc_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: 
v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_trunc_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index af335f2e0b586..e4ae0ad655518 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -781,17 +781,26 @@ v_sqrt_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xb3,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_trunc_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_trunc_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_trunc_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt index f02f0206acd2f..8cf2c2b4f2d1e 
100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt @@ -3545,49 +3545,82 @@ # GFX11: v_swaprel_b32 v255, v255 ; encoding: [0xff,0xd1,0xfe,0x7f] 0x01,0xbb,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, v1 ; encoding: [0x01,0xbb,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbb,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, v1 ; encoding: [0x01,0xbb,0x0a,0x7e] 0x7f,0xbb,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, v127 ; encoding: [0x7f,0xbb,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbb,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, v127 ; encoding: [0x7f,0xbb,0x0a,0x7e] 0x01,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, s1 ; encoding: [0x01,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, s1 ; encoding: [0x01,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, s1 ; encoding: [0x01,0xba,0x0a,0x7e] 0x69,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, s105 ; encoding: [0x69,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, s105 ; encoding: [0x69,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, s105 ; encoding: [0x69,0xba,0x0a,0x7e] 0x6a,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e] 0x6b,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e] 0x7b,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e] 0x7d,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, m0 ; encoding: 
[0x7d,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, m0 ; encoding: [0x7d,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, m0 ; encoding: [0x7d,0xba,0x0a,0x7e] 0x7e,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e] 0x7f,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e] 0x7c,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, null ; encoding: [0x7c,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, null ; encoding: [0x7c,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, null ; encoding: [0x7c,0xba,0x0a,0x7e] 0xc1,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, -1 ; encoding: [0xc1,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, -1 ; encoding: [0xc1,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, -1 ; encoding: [0xc1,0xba,0x0a,0x7e] 0xf0,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e] 0xfd,0xba,0x0a,0x7e -# GFX11: v_trunc_f16_e32 v5, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e] +# GFX11-REAL16: v_trunc_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e] 0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00 -# GFX11: v_trunc_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + 
+0x81,0xbb,0x0a,0x7e +# GFX11-REAL16: v_trunc_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbb,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xbb,0x0a,0x7e] + +0xff,0xbb,0x0a,0x7e +# GFX11-REAL16: v_trunc_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbb,0x0a,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xbb,0x0a,0x7e] + +0xf0,0xba,0xfe,0x7e +# GFX11-REAL16: v_trunc_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xba,0xfe,0x7e] +# GFX11-FAKE16: v_trunc_f16_e32 v127, 0.5 ; encoding: [0xf0,0xba,0xfe,0x7e] + +0xfd,0xba,0x0a,0x7f +# GFX11-REAL16: v_trunc_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xba,0x0a,0x7f] + +0xff,0xba,0xfe,0x7f,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_trunc_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7f,0x0b,0xfe,0x00,0x00] 0x01,0x43,0x0a,0x7e # GFX11: v_trunc_f32_e32 v5, v1 ; encoding: [0x01,0x43,0x0a,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index a4491e02abf05..b9a499549d12c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -2775,46 +2775,72 @@ # GFX11: v_sqrt_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_trunc_f16_dpp 
v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX11: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX11: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX11: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-REAL16: v_trunc_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xba,0xfe,0x7e,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_trunc_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_trunc_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +0xfa,0xba,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_trunc_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX11-REAL16: v_trunc_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX11: v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index 4e15731203168..80c739a98f65f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -522,10 +522,23 @@ # GFX11: v_sqrt_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX11: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xba,0xfe,0x7e,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_trunc_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_trunc_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +0xe9,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_trunc_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187 ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_trunc_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX11: v_trunc_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index f97c678e6a90a..fd84ed734fb31 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -2941,46 +2941,72 @@ # GFX11: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# 
GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX11: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX11: 
v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX11: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# 
GFX11-REAL16: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX11: v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index 3cad28d888202..0edbff63d60ed 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -803,16 +803,32 @@ # GFX11: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX11: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: 
v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX11: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# 
GFX11-REAL16: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX11: v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index 8b2bc97c5de1f..0406d78078305 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -3499,49 +3499,76 @@ # GFX11: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] 0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] 
0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00 
-# GFX11: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00 -# GFX11: v_trunc_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08 -# GFX11: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-FAKE16: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10 -# GFX11: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-REAL16: v_trunc_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-FAKE16: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX11: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp 
div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00 +# GFX11-REAL16: v_trunc_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00 +# GFX11-REAL16: v_trunc_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_trunc_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00 # GFX11: v_trunc_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index aa60378da9ab0..22ae18815a522 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -2915,46 +2915,68 @@ # GFX12: v_sqrt_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff] 
0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shl:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: 
v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX12: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_trunc_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xba,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_trunc_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, 
v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX12-REAL16: v_trunc_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX12: v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 99985e09d7432..bfb84c6cdff39 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -531,10 +531,19 @@ # GFX12: v_sqrt_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX12: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_trunc_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: 
[0x81,0x77,0x39,0x05] + +0xea,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_trunc_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_trunc_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 8ba4f58b787f5..e27469230a15f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -3545,49 +3545,76 @@ # GFX12: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] 0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, s105 ; encoding: 
[0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, exec_hi ; encoding: 
[0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_trunc_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX12: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00 +# GFX12-REAL16: 
v_trunc_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00 +# GFX12-REAL16: v_trunc_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_trunc_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_trunc_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index 98da7c8c54508..bc957576b19b6 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -2773,46 +2773,72 @@ # GFX12: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# 
GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: 
v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# 
GFX12-REAL16: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 8213237ada1e2..989824315b2d2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -755,16 +755,32 @@ # GFX12: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: 
v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# 
GFX12-REAL16: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From c50370c67afddf557ba30d58143b30ffb7203935 Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Sat, 4 Jan 2025 00:44:57 +0800 Subject: [PATCH 093/480] [SLP] NFC. Use InstructionsState::valid if users just want to know whether VL has same opcode. (#120217) Add assert for InstructionsState::getOpcode. Use InstructionsState::getOpcode only when necessary. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 129 +++++++++--------- 1 file changed, 67 insertions(+), 62 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f52ddfda5e64c..c4582df89213d 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -816,27 +816,34 @@ class InstructionsState { Instruction *AltOp = nullptr; public: - Instruction *getMainOp() const { return MainOp; } + Instruction *getMainOp() const { + assert(valid() && "InstructionsState is invalid."); + return MainOp; + } - Instruction *getAltOp() const { return AltOp; } + Instruction *getAltOp() const { + assert(valid() && "InstructionsState is invalid."); + return AltOp; + } /// The main/alternate opcodes for the list of instructions. - unsigned getOpcode() const { - return MainOp ? MainOp->getOpcode() : 0; - } + unsigned getOpcode() const { return getMainOp()->getOpcode(); } - unsigned getAltOpcode() const { - return AltOp ? 
AltOp->getOpcode() : 0; - } + unsigned getAltOpcode() const { return getAltOp()->getOpcode(); } /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { return AltOp != MainOp; } + bool isAltShuffle() const { return getMainOp() != getAltOp(); } bool isOpcodeOrAlt(Instruction *I) const { unsigned CheckedOpcode = I->getOpcode(); return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; } + /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp + bool valid() const { return MainOp && AltOp; } + + explicit operator bool() const { return valid(); } + InstructionsState() = delete; InstructionsState(Instruction *MainOp, Instruction *AltOp) : MainOp(MainOp), AltOp(AltOp) {} @@ -869,8 +876,8 @@ static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, (!isa(BaseOp0) && !isa(Op0) && !isa(BaseOp1) && !isa(Op1)) || BaseOp0 == Op0 || BaseOp1 == Op1 || - getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() || - getSameOpcode({BaseOp1, Op1}, TLI).getOpcode(); + getSameOpcode({BaseOp0, Op0}, TLI) || + getSameOpcode({BaseOp1, Op1}, TLI); } /// \returns true if a compare instruction \p CI has similar "look" and @@ -1847,7 +1854,7 @@ class BoUpSLP { InstructionsState S = getSameOpcode(Ops, TLI); // Note: Only consider instructions with <= 2 operands to avoid // complexity explosion. - if (S.getOpcode() && + if (S && (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() || !S.isAltShuffle()) && all_of(Ops, [&S](Value *V) { @@ -2382,7 +2389,7 @@ class BoUpSLP { // Use Boyer-Moore majority voting for finding the majority opcode and // the number of times it occurs. if (auto *I = dyn_cast(OpData.V)) { - if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() || + if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) || I->getParent() != Parent) { if (NumOpsWithSameOpcodeParent == 0) { NumOpsWithSameOpcodeParent = 1; @@ -2501,8 +2508,7 @@ class BoUpSLP { // 2.1. 
If we have only 2 lanes, need to check that value in the // next lane does not build same opcode sequence. (Lns == 2 && - !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) - .getOpcode() && + !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) && isa(Data.V)))) || // 3. The operand in the current lane is loop invariant (can be // hoisted out) and another operand is also a loop invariant @@ -2511,7 +2517,7 @@ class BoUpSLP { // FIXME: need to teach the cost model about this case for better // estimation. (IsInvariant && !isa(Data.V) && - !getSameOpcode({Op, Data.V}, TLI).getOpcode() && + !getSameOpcode({Op, Data.V}, TLI) && L->isLoopInvariant(Data.V))) { FoundCandidate = true; Data.IsUsed = Data.V == Op; @@ -2541,7 +2547,7 @@ class BoUpSLP { return true; Value *OpILn = getValue(OpI, Ln); return (L && L->isLoopInvariant(OpILn)) || - (getSameOpcode({Op, OpILn}, TLI).getOpcode() && + (getSameOpcode({Op, OpILn}, TLI) && allSameBlock({Op, OpILn})); })) return true; @@ -2698,7 +2704,7 @@ class BoUpSLP { OperandData &AltOp = getData(OpIdx, Lane); InstructionsState OpS = getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI); - if (OpS.getOpcode() && OpS.isAltShuffle()) + if (OpS && OpS.isAltShuffle()) MainAltOps[OpIdx].push_back(AltOp.V); } } @@ -3400,6 +3406,7 @@ class BoUpSLP { } void setOperations(const InstructionsState &S) { + assert(S && "InstructionsState is invalid."); MainOp = S.getMainOp(); AltOp = S.getAltOp(); } @@ -3600,7 +3607,7 @@ class BoUpSLP { "Need to vectorize gather entry?"); // Gathered loads still gathered? Do not create entry, use the original one. 
if (GatheredLoadsEntriesFirst.has_value() && - EntryState == TreeEntry::NeedToGather && + EntryState == TreeEntry::NeedToGather && S && S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX && !UserTreeIdx.UserTE) return nullptr; @@ -3618,7 +3625,8 @@ class BoUpSLP { ReuseShuffleIndices.end()); if (ReorderIndices.empty()) { Last->Scalars.assign(VL.begin(), VL.end()); - Last->setOperations(S); + if (S) + Last->setOperations(S); } else { // Reorder scalars and build final mask. Last->Scalars.assign(VL.size(), nullptr); @@ -3629,7 +3637,8 @@ class BoUpSLP { return VL[Idx]; }); InstructionsState S = getSameOpcode(Last->Scalars, *TLI); - Last->setOperations(S); + if (S) + Last->setOperations(S); Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); } if (!Last->isGather()) { @@ -4774,8 +4783,7 @@ static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, (!GEP2 || isConstant(GEP2->getOperand(1)))) || !CompareOpcodes || (GEP1 && GEP2 && - getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI) - .getOpcode())); + getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI))); } /// Calculates minimal alignment as a common alignment. @@ -7500,7 +7508,7 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S, [&](ArrayRef Op) { if (allConstant(Op) || (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) && - getSameOpcode(Op, *TLI).getMainOp())) + getSameOpcode(Op, *TLI))) return false; DenseMap Uniques; for (Value *V : Op) { @@ -8071,15 +8079,14 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Don't go into catchswitch blocks, which can happen with PHIs. // Such blocks can only have PHIs and the catchswitch. There is no // place to insert a shuffle if we need to, so just avoid that issue. 
- if (S.getMainOp() && - isa(S.getMainOp()->getParent()->getTerminator())) { + if (S && isa(S.getMainOp()->getParent()->getTerminator())) { LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return; } // Check if this is a duplicate of another entry. - if (S.getOpcode()) { + if (S) { if (TreeEntry *E = getTreeEntry(S.getMainOp())) { LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n"); @@ -8140,13 +8147,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // a load), in which case peek through to include it in the tree, without // ballooning over-budget. if (Depth >= RecursionMaxDepth && - !(S.getMainOp() && !S.isAltShuffle() && VL.size() >= 4 && + !(S && !S.isAltShuffle() && VL.size() >= 4 && (match(S.getMainOp(), m_Load(m_Value())) || all_of(VL, [&S](const Value *I) { return match(I, m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) && - cast(I)->getOpcode() == - S.getMainOp()->getOpcode(); + cast(I)->getOpcode() == S.getOpcode(); })))) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); if (TryToFindDuplicates(S)) @@ -8156,7 +8162,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } // Don't handle scalable vectors - if (S.getOpcode() == Instruction::ExtractElement && + if (S && S.getOpcode() == Instruction::ExtractElement && isa( cast(S.getMainOp())->getVectorOperandType())) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n"); @@ -8180,7 +8186,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // vectorize. 
auto &&NotProfitableForVectorization = [&S, this, Depth](ArrayRef VL) { - if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2) + if (!S || !S.isAltShuffle() || VL.size() > 2) return false; if (VectorizableTree.size() < MinTreeSize) return false; @@ -8235,7 +8241,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, bool IsScatterVectorizeUserTE = UserTreeIdx.UserTE && UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; - bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL); + bool AreAllSameBlock = S && allSameBlock(VL); bool AreScatterAllGEPSameBlock = (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() && VL.size() > 2 && @@ -8252,8 +8258,9 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE, SortedIndices)); bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock; - if (!AreAllSameInsts || (!S.getOpcode() && allConstant(VL)) || isSplat(VL) || - (isa_and_present( + if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) || + (S && + isa( S.getMainOp()) && !all_of(VL, isVectorLikeInstWithConstOps)) || NotProfitableForVectorization(VL)) { @@ -8265,7 +8272,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } // Don't vectorize ephemeral values. - if (S.getOpcode() && !EphValues.empty()) { + if (S && !EphValues.empty()) { for (Value *V : VL) { if (EphValues.count(V)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V @@ -8324,7 +8331,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Instruction *VL0 = S.getMainOp(); BB = VL0->getParent(); - if (S.getMainOp() && + if (S && (BB->isEHPad() || isa_and_nonnull(BB->getTerminator()) || !DT->isReachableFromEntry(BB))) { // Don't go into unreachable blocks. 
They may contain instructions with @@ -8378,8 +8385,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = + S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode(); auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) { // Postpone PHI nodes creation SmallVector PHIOps; @@ -8388,7 +8395,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (Op.empty()) continue; InstructionsState S = getSameOpcode(Op, *TLI); - if (S.getOpcode() != Instruction::PHI || S.isAltShuffle()) + if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle()) buildTree_rec(Op, Depth + 1, {TE, I}); else PHIOps.push_back(I); @@ -9771,7 +9778,7 @@ void BoUpSLP::transformNodes() { if (IsSplat) continue; InstructionsState S = getSameOpcode(Slice, *TLI); - if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice) || + if (!S || S.isAltShuffle() || !allSameBlock(Slice) || (S.getOpcode() == Instruction::Load && areKnownNonVectorizableLoads(Slice)) || (S.getOpcode() != Instruction::Load && !has_single_bit(VF))) @@ -11086,7 +11093,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, if (const TreeEntry *OpTE = getTreeEntry(V)) return getCastContextHint(*OpTE); InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI); - if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle()) + if (SrcState && SrcState.getOpcode() == Instruction::Load && + !SrcState.isAltShuffle()) return TTI::CastContextHint::GatherScatter; return TTI::CastContextHint::None; }; @@ -13265,7 +13273,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( Value *In1 = PHI1->getIncomingValue(I); if (isConstant(In) && isConstant(In1)) continue; - if (!getSameOpcode({In, In1}, *TLI).getOpcode()) + if (!getSameOpcode({In, In1}, *TLI)) return false; if 
(cast(In)->getParent() != cast(In1)->getParent()) @@ -13293,7 +13301,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( if (It != UsedValuesEntry.end()) UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second; return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE && - getSameOpcode({V, V1}, *TLI).getOpcode() && + getSameOpcode({V, V1}, *TLI) && cast(V)->getParent() == cast(V1)->getParent() && (!isa(V1) || AreCompatiblePHIs(V, V1)); @@ -14560,12 +14568,12 @@ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, ArrayRef VL = E->getOperand(NodeIdx); InstructionsState S = getSameOpcode(VL, *TLI); // Special processing for GEPs bundle, which may include non-gep values. - if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) { + if (!S && VL.front()->getType()->isPointerTy()) { const auto *It = find_if(VL, IsaPred); if (It != VL.end()) S = getSameOpcode(*It, *TLI); } - if (!S.getOpcode()) + if (!S) return nullptr; auto CheckSameVE = [&](const TreeEntry *VE) { return VE->isSame(VL) && @@ -18546,8 +18554,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(), ValOps.size()) || (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1)); - if ((!IsAllowedSize && S.getOpcode() && - S.getOpcode() != Instruction::Load && + if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load && (!S.getMainOp()->isSafeToRemove() || any_of(ValOps.getArrayRef(), [&](Value *V) { @@ -18557,8 +18564,8 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, return !Stores.contains(U); })); }))) || - (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) { - Size = (!IsAllowedSize && S.getOpcode()) ? 1 : 2; + (ValOps.size() > Chain.size() / 2 && !S)) { + Size = (!IsAllowedSize && S) ? 
1 : 2; return false; } } @@ -18581,7 +18588,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, R.computeMinimumValueSizes(); Size = R.getCanonicalGraphSize(); - if (S.getOpcode() == Instruction::Load) + if (S && S.getOpcode() == Instruction::Load) Size = 2; // cut off masked gather small trees InstructionCost Cost = R.getTreeCost(); @@ -19082,7 +19089,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, // Check that all of the parts are instructions of the same type, // we permit an alternate opcode via InstructionsState. InstructionsState S = getSameOpcode(VL, *TLI); - if (!S.getOpcode()) + if (!S) return false; Instruction *I0 = S.getMainOp(); @@ -19906,16 +19913,16 @@ class HorizontalReduction { // Also check if the instruction was folded to constant/other value. auto *Inst = dyn_cast(RdxVal); if ((Inst && isVectorLikeInstWithConstOps(Inst) && - (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) || - (S.getOpcode() && !Inst)) + (!S || !S.isOpcodeOrAlt(Inst))) || + (S && !Inst)) continue; Candidates.push_back(RdxVal); TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]); } bool ShuffledExtracts = false; // Try to handle shuffled extractelements. 
- if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() && - I + 1 < E) { + if (S && S.getOpcode() == Instruction::ExtractElement && + !S.isAltShuffle() && I + 1 < E) { SmallVector CommonCandidates(Candidates); for (Value *RV : ReducedVals[I + 1]) { Value *RdxVal = TrackedVals.at(RV); @@ -21310,7 +21317,7 @@ static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); } InstructionsState S = getSameOpcode({I1, I2}, TLI); - if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle())) + if (S && (IsCompatibility || !S.isAltShuffle())) continue; if (IsCompatibility) return false; @@ -21468,7 +21475,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (NodeI1 != NodeI2) return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); InstructionsState S = getSameOpcode({I1, I2}, *TLI); - if (S.getOpcode() && !S.isAltShuffle()) + if (S && !S.isAltShuffle()) continue; return I1->getOpcode() < I2->getOpcode(); } @@ -21531,8 +21538,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return false; if (I1->getParent() != I2->getParent()) return false; - InstructionsState S = getSameOpcode({I1, I2}, *TLI); - if (S.getOpcode()) + if (getSameOpcode({I1, I2}, *TLI)) continue; return false; } @@ -21904,8 +21910,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { if (auto *I2 = dyn_cast(V2->getValueOperand())) { if (I1->getParent() != I2->getParent()) return false; - InstructionsState S = getSameOpcode({I1, I2}, *TLI); - return S.getOpcode() > 0; + return getSameOpcode({I1, I2}, *TLI).valid(); } if (isa(V1->getValueOperand()) && isa(V2->getValueOperand())) From c744ed53a84f90598751cdcda4c68900113587ab Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 11:58:04 -0500 Subject: [PATCH 094/480] [AMDGPU][True16][MC] disable incorrect VOPC t16 instruction (#120271) The current VOPC t16 instructions are not implemented with the correct 
t16 pseudo. Thus the current t16/fake16 instructions are all in fake16 format. The plan is to remove the incorrect t16 instructions and refactor them. The first step is to remove them in this patch. The next step will be updating the t16/fake16 pseudo to the correct format and add back true16 instruction one by one in the upcoming patches. --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 3 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 4 +- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 132 +++++------------- .../inst-select-amdgcn.fcmp.constants.w32.mir | 8 +- .../inst-select-amdgcn.fcmp.constants.w64.mir | 8 +- 5 files changed, 47 insertions(+), 108 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3be865f03df1f..041b9b4d66f63 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1125,8 +1125,9 @@ static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, unsigned FakeS16Opc, unsigned S32Opc, unsigned S64Opc) { if (Size == 16) + // FIXME-TRUE16 use TrueS16Opc when realtrue16 is supported for CMP code return ST.hasTrue16BitInsts() - ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc + ? ST.useRealTrue16Insts() ? 
FakeS16Opc : FakeS16Opc : S16Opc; if (Size == 32) return S32Opc; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 789ce8815cf80..e388efe73cddb 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2674,8 +2674,8 @@ let OtherPredicates = [NotHasTrue16BitInsts] in { } // end OtherPredicates = [NotHasTrue16BitInsts] let OtherPredicates = [HasTrue16BitInsts] in { - def : FPToI1Pat; - def : FPToI1Pat; + def : FPToI1Pat; + def : FPToI1Pat; } // end OtherPredicates = [HasTrue16BitInsts] def : FPToI1Pat; diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 9bf043ea334fe..8589d598f5870 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -1130,20 +1130,20 @@ defm : ICMP_Pattern ; defm : ICMP_Pattern ; defm : ICMP_Pattern ; -let OtherPredicates = [HasTrue16BitInsts] in { -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -defm : ICMP_Pattern ; -} // End OtherPredicates = [HasTrue16BitInsts] - -let OtherPredicates = [NotHasTrue16BitInsts] in { +let True16Predicate = UseFakeTrue16Insts in { +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +defm : ICMP_Pattern ; +} // End True16Predicate = UseFakeTrue16Insts + +let True16Predicate = NotHasTrue16BitInsts in { defm : ICMP_Pattern ; defm : ICMP_Pattern ; defm : ICMP_Pattern ; @@ -1154,7 +1154,7 @@ defm : ICMP_Pattern ; defm : ICMP_Pattern ; defm : ICMP_Pattern ; defm : ICMP_Pattern ; -} // End OtherPredicates = [NotHasTrue16BitInsts] +} // End True16Predicate = NotHasTrue16BitInsts multiclass FCMP_Pattern { let 
WaveSizePredicate = isWave64 in @@ -1215,25 +1215,25 @@ defm : FCMP_Pattern ; defm : FCMP_Pattern ; defm : FCMP_Pattern ; -let OtherPredicates = [HasTrue16BitInsts] in { -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; - -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -defm : FCMP_Pattern ; -} // End OtherPredicates = [HasTrue16BitInsts] - -let OtherPredicates = [NotHasTrue16BitInsts] in { +let True16Predicate = UseFakeTrue16Insts in { +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; + +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +} // End True16Predicate = UseFakeTrue16Insts + +let True16Predicate = NotHasTrue16BitInsts in { defm : FCMP_Pattern ; defm : FCMP_Pattern ; defm : FCMP_Pattern ; @@ -1249,7 +1249,7 @@ defm : FCMP_Pattern ; defm : FCMP_Pattern ; defm : FCMP_Pattern ; defm : FCMP_Pattern ; -} // End OtherPredicates = [NotHasTrue16BitInsts] +} // End True16Predicate = NotHasTrue16BitInsts //===----------------------------------------------------------------------===// // DPP Encodings @@ -1707,23 +1707,6 @@ multiclass VOPCX_Real_t16_gfx11_gfx12 op, string asm_name, VOPCX_Real_t16, VOPCX_Real_t16; -defm V_CMP_F_F16_t16 : VOPC_Real_t16_gfx11<0x000, "v_cmp_f_f16">; -defm V_CMP_LT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x001, "v_cmp_lt_f16">; -defm V_CMP_EQ_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x002, "v_cmp_eq_f16">; -defm V_CMP_LE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x003, "v_cmp_le_f16">; -defm V_CMP_GT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x004, "v_cmp_gt_f16">; -defm V_CMP_LG_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x005, "v_cmp_lg_f16">; -defm 
V_CMP_GE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x006, "v_cmp_ge_f16">; -defm V_CMP_O_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x007, "v_cmp_o_f16">; -defm V_CMP_U_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x008, "v_cmp_u_f16">; -defm V_CMP_NGE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x009, "v_cmp_nge_f16">; -defm V_CMP_NLG_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00a, "v_cmp_nlg_f16">; -defm V_CMP_NGT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00b, "v_cmp_ngt_f16">; -defm V_CMP_NLE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00c, "v_cmp_nle_f16">; -defm V_CMP_NEQ_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00d, "v_cmp_neq_f16">; -defm V_CMP_NLT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00e, "v_cmp_nlt_f16">; -defm V_CMP_T_F16_t16 : VOPC_Real_t16_gfx11<0x00f, "v_cmp_t_f16", "V_CMP_TRU_F16_t16", "v_cmp_tru_f16">; - defm V_CMP_F_F16_fake16 : VOPC_Real_t16_gfx11<0x000, "v_cmp_f_f16">; defm V_CMP_LT_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x001, "v_cmp_lt_f16">; defm V_CMP_EQ_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x002, "v_cmp_eq_f16">; @@ -1759,19 +1742,6 @@ defm V_CMP_NLT_F32 : VOPC_Real_gfx11_gfx12<0x01e>; defm V_CMP_T_F32 : VOPC_Real_with_name_gfx11<0x01f, "V_CMP_TRU_F32", "v_cmp_t_f32">; defm V_CMP_T_F64 : VOPC_Real_with_name_gfx11<0x02f, "V_CMP_TRU_F64", "v_cmp_t_f64">; -defm V_CMP_LT_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x031, "v_cmp_lt_i16">; -defm V_CMP_EQ_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x032, "v_cmp_eq_i16">; -defm V_CMP_LE_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x033, "v_cmp_le_i16">; -defm V_CMP_GT_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x034, "v_cmp_gt_i16">; -defm V_CMP_NE_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x035, "v_cmp_ne_i16">; -defm V_CMP_GE_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x036, "v_cmp_ge_i16">; -defm V_CMP_LT_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x039, "v_cmp_lt_u16">; -defm V_CMP_EQ_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03a, "v_cmp_eq_u16">; -defm V_CMP_LE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03b, "v_cmp_le_u16">; -defm V_CMP_GT_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03c, 
"v_cmp_gt_u16">; -defm V_CMP_NE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03d, "v_cmp_ne_u16">; -defm V_CMP_GE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03e, "v_cmp_ge_u16">; - defm V_CMP_LT_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x031, "v_cmp_lt_i16">; defm V_CMP_EQ_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x032, "v_cmp_eq_i16">; defm V_CMP_LE_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x033, "v_cmp_le_i16">; @@ -1819,28 +1789,10 @@ defm V_CMP_NE_U64 : VOPC_Real_gfx11_gfx12<0x05d>; defm V_CMP_GE_U64 : VOPC_Real_gfx11_gfx12<0x05e>; defm V_CMP_T_U64 : VOPC_Real_gfx11<0x05f>; -defm V_CMP_CLASS_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x07d, "v_cmp_class_f16">; defm V_CMP_CLASS_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x07d, "v_cmp_class_f16">; defm V_CMP_CLASS_F32 : VOPC_Real_gfx11_gfx12<0x07e>; defm V_CMP_CLASS_F64 : VOPC_Real_gfx11_gfx12<0x07f>; -defm V_CMPX_F_F16_t16 : VOPCX_Real_t16_gfx11<0x080, "v_cmpx_f_f16">; -defm V_CMPX_LT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">; -defm V_CMPX_EQ_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">; -defm V_CMPX_LE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x083, "v_cmpx_le_f16">; -defm V_CMPX_GT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x084, "v_cmpx_gt_f16">; -defm V_CMPX_LG_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x085, "v_cmpx_lg_f16">; -defm V_CMPX_GE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x086, "v_cmpx_ge_f16">; -defm V_CMPX_O_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x087, "v_cmpx_o_f16">; -defm V_CMPX_U_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x088, "v_cmpx_u_f16">; -defm V_CMPX_NGE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x089, "v_cmpx_nge_f16">; -defm V_CMPX_NLG_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08a, "v_cmpx_nlg_f16">; -defm V_CMPX_NGT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08b, "v_cmpx_ngt_f16">; -defm V_CMPX_NLE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08c, "v_cmpx_nle_f16">; -defm V_CMPX_NEQ_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08d, "v_cmpx_neq_f16">; -defm V_CMPX_NLT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08e, 
"v_cmpx_nlt_f16">; -defm V_CMPX_T_F16_t16 : VOPCX_Real_with_name_gfx11<0x08f, "V_CMPX_TRU_F16_t16", "v_cmpx_t_f16", "v_cmpx_tru_f16">; - defm V_CMPX_F_F16_fake16 : VOPCX_Real_t16_gfx11<0x080, "v_cmpx_f_f16">; defm V_CMPX_LT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">; defm V_CMPX_EQ_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">; @@ -1892,19 +1844,6 @@ defm V_CMPX_NEQ_F64 : VOPCX_Real_gfx11_gfx12<0x0ad>; defm V_CMPX_NLT_F64 : VOPCX_Real_gfx11_gfx12<0x0ae>; defm V_CMPX_T_F64 : VOPCX_Real_with_name_gfx11<0x0af, "V_CMPX_TRU_F64", "v_cmpx_t_f64">; -defm V_CMPX_LT_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b1, "v_cmpx_lt_i16">; -defm V_CMPX_EQ_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b2, "v_cmpx_eq_i16">; -defm V_CMPX_LE_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b3, "v_cmpx_le_i16">; -defm V_CMPX_GT_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b4, "v_cmpx_gt_i16">; -defm V_CMPX_NE_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b5, "v_cmpx_ne_i16">; -defm V_CMPX_GE_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b6, "v_cmpx_ge_i16">; -defm V_CMPX_LT_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b9, "v_cmpx_lt_u16">; -defm V_CMPX_EQ_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0ba, "v_cmpx_eq_u16">; -defm V_CMPX_LE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bb, "v_cmpx_le_u16">; -defm V_CMPX_GT_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bc, "v_cmpx_gt_u16">; -defm V_CMPX_NE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bd, "v_cmpx_ne_u16">; -defm V_CMPX_GE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0be, "v_cmpx_ge_u16">; - defm V_CMPX_LT_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b1, "v_cmpx_lt_i16">; defm V_CMPX_EQ_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b2, "v_cmpx_eq_i16">; defm V_CMPX_LE_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b3, "v_cmpx_le_i16">; @@ -1951,7 +1890,6 @@ defm V_CMPX_GT_U64 : VOPCX_Real_gfx11_gfx12<0x0dc>; defm V_CMPX_NE_U64 : VOPCX_Real_gfx11_gfx12<0x0dd>; defm V_CMPX_GE_U64 : VOPCX_Real_gfx11_gfx12<0x0de>; defm V_CMPX_T_U64 : VOPCX_Real_gfx11<0x0df>; -defm 
V_CMPX_CLASS_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0fd, "v_cmpx_class_f16">; defm V_CMPX_CLASS_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0fd, "v_cmpx_class_f16">; defm V_CMPX_CLASS_F32 : VOPCX_Real_gfx11_gfx12<0x0fe>; defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx11_gfx12<0x0ff>; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir index 55015c6d13d8a..cdb67caea12cf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir @@ -20,8 +20,8 @@ body: | ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] - ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_fake16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_fake16_e64_]] ; ; GFX11-FAKE16-LABEL: name: fcmp_false_f16 ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 @@ -58,8 +58,8 @@ body: | ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] - ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit 
$mode, implicit $exec - ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_fake16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_fake16_e64_]] ; ; GFX11-FAKE16-LABEL: name: fcmp_true_f16 ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir index 4241f945a87d5..ed811d37c3d0f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir @@ -20,8 +20,8 @@ body: | ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] - ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_fake16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_fake16_e64_]] ; ; GFX11-FAKE16-LABEL: name: fcmp_false_f16 ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 @@ -58,8 +58,8 @@ body: | ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY 
[[V_CVT_F16_F32_t16_e64_1]] - ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_fake16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_fake16_e64_]] ; ; GFX11-FAKE16-LABEL: name: fcmp_true_f16 ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 From fa56e8bb6451bdf24be6c2a8737dab5fe6a2039c Mon Sep 17 00:00:00 2001 From: agozillon Date: Fri, 3 Jan 2025 18:01:01 +0100 Subject: [PATCH 095/480] [OpenMP][MLIR] Fix threadprivate lowering when compiling for target when target operations are in use (#119310) Currently the compiler will ICE in programs like the following on the device lowering pass: ``` program main implicit none type i1_t integer :: val(1000) end type i1_t integer :: i type(i1_t), pointer :: newi1 type(i1_t), pointer :: tab=>null() integer, dimension(:), pointer :: tabval !$omp THREADPRIVATE(tab) allocate(newi1) tab=>newi1 tab%val(:)=1 tabval=>tab%val !$omp target teams distribute parallel do do i = 1, 1000 tabval(i) = i end do !$omp end target teams distribute parallel do end program main ``` This is due to the fact that THREADPRIVATE returns a result operation, and this operation can actually be used by other LLVM dialect (or other dialect) operations. However, we currently skip the lowering of threadprivate, so we effectively never generate and bind an LLVM-IR result to the threadprivate operation result. So when we later go on to lower dependent LLVM dialect operations, we are missing the required LLVM-IR result, try to access and use it and then ICE. 
The fix in this particular PR is to allow compilation of threadprivate for device as well as host, and simply treat the device compilation as a no-op, binding the LLVM-IR result of threadprivate with no alterations, which will allow the rest of the compilation to proceed, where we'll eventually discard the host segment in any case. The other possible solution to this I can think of is doing something similar to Flang's passes that occur prior to CodeGen to the LLVM dialect, where they erase/no-op certain unrequired operations or transform them to lower level series of operations. And we would erase/no-op threadprivate on device as we'd never have these in target regions. The main issues I can see with this are that we currently do not specialise this stage based on whether we're compiling for device or host, so it's setting a precedent and adding another point of having to understand the separation between target and host compilation. I am also not sure we'd necessarily want to enforce this at a dialect level in case someone else wishes to add a different lowering flow or translation flow. Another possible issue is that a target operation we have/utilise would depend on the result of threadprivate, meaning we'd not be allowed to entirely erase/no-op it; I am not sure of any situations where this may be an issue currently, though.
--- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 38 +++++++++++++------ ...ptarget-threadprivate-device-lowering.mlir | 30 +++++++++++++++ .../fortran/target-with-threadprivate.f90 | 37 ++++++++++++++++++ 3 files changed, 94 insertions(+), 11 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir create mode 100644 offload/test/offloading/fortran/target-with-threadprivate.f90 diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index ce129417fc5b2..87cb7f03fec6a 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -2588,6 +2588,7 @@ static LogicalResult convertOmpThreadprivate(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); auto threadprivateOp = cast(opInst); if (failed(checkImplementationStatus(opInst))) @@ -2595,6 +2596,10 @@ convertOmpThreadprivate(Operation &opInst, llvm::IRBuilderBase &builder, Value symAddr = threadprivateOp.getSymAddr(); auto *symOp = symAddr.getDefiningOp(); + + if (auto asCast = dyn_cast(symOp)) + symOp = asCast.getOperand().getDefiningOp(); + if (!isa(symOp)) return opInst.emitError("Addressing symbol not found"); LLVM::AddressOfOp addressOfOp = dyn_cast(symOp); @@ -2602,17 +2607,20 @@ convertOmpThreadprivate(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::GlobalOp global = addressOfOp.getGlobal(moduleTranslation.symbolTable()); llvm::GlobalValue *globalValue = moduleTranslation.lookupGlobal(global); - llvm::Type *type = globalValue->getValueType(); - llvm::TypeSize typeSize = - builder.GetInsertBlock()->getModule()->getDataLayout().getTypeStoreSize( - type); - llvm::ConstantInt *size = 
builder.getInt64(typeSize.getFixedValue()); - llvm::StringRef suffix = llvm::StringRef(".cache", 6); - std::string cacheName = (Twine(global.getSymName()).concat(suffix)).str(); - llvm::Value *callInst = - moduleTranslation.getOpenMPBuilder()->createCachedThreadPrivate( - ompLoc, globalValue, size, cacheName); - moduleTranslation.mapValue(opInst.getResult(0), callInst); + + if (!ompBuilder->Config.isTargetDevice()) { + llvm::Type *type = globalValue->getValueType(); + llvm::TypeSize typeSize = + builder.GetInsertBlock()->getModule()->getDataLayout().getTypeStoreSize( + type); + llvm::ConstantInt *size = builder.getInt64(typeSize.getFixedValue()); + llvm::Value *callInst = ompBuilder->createCachedThreadPrivate( + ompLoc, globalValue, size, global.getSymName() + ".cache"); + moduleTranslation.mapValue(opInst.getResult(0), callInst); + } else { + moduleTranslation.mapValue(opInst.getResult(0), globalValue); + } + return success(); } @@ -4212,6 +4220,14 @@ static bool isTargetDeviceOp(Operation *op) { if (op->getParentOfType()) return true; + // Certain operations return results, and whether utilised in host or + // target there is a chance an LLVM Dialect operation depends on it + // by taking it in as an operand, so we must always lower these in + // some manner or result in an ICE (whether they end up in a no-op + // or otherwise). 
+ if (mlir::isa(op)) + return true; + if (auto parentFn = op->getParentOfType()) if (auto declareTargetIface = llvm::dyn_cast( diff --git a/mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir b/mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir new file mode 100644 index 0000000000000..279ecb3f8e998 --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir @@ -0,0 +1,30 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// Not intended to be a functional example, the aim of this test is to verify +// omp.threadprivate does not crash on lowering during the OpenMP target device +// pass when used in conjunction with target code in the same module. + +module attributes {omp.is_target_device = true } { + llvm.func @func() attributes {omp.declare_target = #omp.declaretarget} { + %0 = llvm.mlir.addressof @_QFEpointer2 : !llvm.ptr + %1 = omp.threadprivate %0 : !llvm.ptr -> !llvm.ptr + %2 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(implicit, to) capture(ByRef) -> !llvm.ptr + omp.target map_entries(%2 -> %arg0 : !llvm.ptr) { + %3 = llvm.mlir.constant(1 : i32) : i32 + %4 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + llvm.store %3, %4 : i32, !llvm.ptr + omp.terminator + } + llvm.return + } + llvm.mlir.global internal @_QFEpointer2() {addr_space = 0 : i32} : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + llvm.return %0 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + } +} + +// CHECK: define weak_odr protected void @{{.*}}(ptr %{{.*}}, ptr %[[ARG1:.*]]) { +// CHECK: %[[ALLOCA:.*]] = alloca ptr, align 8 +// CHECK: store ptr %[[ARG1]], ptr %[[ALLOCA]], align 8 +// CHECK: 
%[[LOAD_ALLOCA:.*]] = load ptr, ptr %[[ALLOCA]], align 8 +// CHECK: store i32 1, ptr %[[LOAD_ALLOCA]], align 4 diff --git a/offload/test/offloading/fortran/target-with-threadprivate.f90 b/offload/test/offloading/fortran/target-with-threadprivate.f90 new file mode 100644 index 0000000000000..10c7cecf08412 --- /dev/null +++ b/offload/test/offloading/fortran/target-with-threadprivate.f90 @@ -0,0 +1,37 @@ +! Basic offloading test that makes sure we can use the predominantly host +! pragma threadprivate in the same program as target code +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +program main + implicit none + + type dtype + integer :: val(10) + end type dtype + + integer :: i + type(dtype), pointer :: pointer1 + type(dtype), pointer :: pointer2=>null() + integer, dimension(:), pointer :: data_pointer + +!$omp threadprivate(pointer2) + +nullify(pointer1) +allocate(pointer1) + +pointer2=>pointer1 +pointer2%val(:)=1 +data_pointer=>pointer2%val + +!$omp target + do i = 1, 10 + data_pointer(i) = i + end do +!$omp end target + +print *, data_pointer + +end program main + +! CHECK: 1 2 3 4 5 6 7 8 9 10 From 1cade8699719c934a8debb7bef9fdc3ff11e9602 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Fri, 3 Jan 2025 18:02:59 +0100 Subject: [PATCH 096/480] [mlir][arith] Fold `(a * b) / b -> a` (#121534) If overflow flags allow it. 
Alive2 check: https://alive2.llvm.org/ce/z/5XWjWE --- mlir/lib/Dialect/Arith/IR/ArithOps.cpp | 24 +++++++++ mlir/test/Dialect/Arith/canonicalize.mlir | 64 +++++++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp index d8b314a3fa43c..e016a6e16e59f 100644 --- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp +++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp @@ -580,11 +580,31 @@ void arith::MulUIExtendedOp::getCanonicalizationPatterns( // DivUIOp //===----------------------------------------------------------------------===// +/// Fold `(a * b) / b -> a` +static Value foldDivMul(Value lhs, Value rhs, + arith::IntegerOverflowFlags ovfFlags) { + auto mul = lhs.getDefiningOp(); + if (!mul || !bitEnumContainsAll(mul.getOverflowFlags(), ovfFlags)) + return {}; + + if (mul.getLhs() == rhs) + return mul.getRhs(); + + if (mul.getRhs() == rhs) + return mul.getLhs(); + + return {}; +} + OpFoldResult arith::DivUIOp::fold(FoldAdaptor adaptor) { // divui (x, 1) -> x. if (matchPattern(adaptor.getRhs(), m_One())) return getLhs(); + // (a * b) / b -> a + if (Value val = foldDivMul(getLhs(), getRhs(), IntegerOverflowFlags::nuw)) + return val; + // Don't fold if it would require a division by zero. bool div0 = false; auto result = constFoldBinaryOp(adaptor.getOperands(), @@ -621,6 +641,10 @@ OpFoldResult arith::DivSIOp::fold(FoldAdaptor adaptor) { if (matchPattern(adaptor.getRhs(), m_One())) return getLhs(); + // (a * b) / b -> a + if (Value val = foldDivMul(getLhs(), getRhs(), IntegerOverflowFlags::nsw)) + return val; + // Don't fold if it would overflow or if it requires a division by zero. 
bool overflowOrDiv0 = false; auto result = constFoldBinaryOp( diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir index 6a186a0c6ceca..522711b08f289 100644 --- a/mlir/test/Dialect/Arith/canonicalize.mlir +++ b/mlir/test/Dialect/Arith/canonicalize.mlir @@ -2060,6 +2060,70 @@ func.func @test_divf1(%arg0 : f32, %arg1 : f32) -> (f32) { // ----- +func.func @fold_divui_of_muli_0(%arg0 : index, %arg1 : index) -> index { + %0 = arith.muli %arg0, %arg1 overflow : index + %1 = arith.divui %0, %arg0 : index + return %1 : index +} +// CHECK-LABEL: func @fold_divui_of_muli_0( +// CHECK-SAME: %[[ARG0:.+]]: index, +// CHECK-SAME: %[[ARG1:.+]]: index) +// CHECK: return %[[ARG1]] + +func.func @fold_divui_of_muli_1(%arg0 : index, %arg1 : index) -> index { + %0 = arith.muli %arg0, %arg1 overflow : index + %1 = arith.divui %0, %arg1 : index + return %1 : index +} +// CHECK-LABEL: func @fold_divui_of_muli_1( +// CHECK-SAME: %[[ARG0:.+]]: index, +// CHECK-SAME: %[[ARG1:.+]]: index) +// CHECK: return %[[ARG0]] + +func.func @fold_divsi_of_muli_0(%arg0 : index, %arg1 : index) -> index { + %0 = arith.muli %arg0, %arg1 overflow : index + %1 = arith.divsi %0, %arg0 : index + return %1 : index +} +// CHECK-LABEL: func @fold_divsi_of_muli_0( +// CHECK-SAME: %[[ARG0:.+]]: index, +// CHECK-SAME: %[[ARG1:.+]]: index) +// CHECK: return %[[ARG1]] + +func.func @fold_divsi_of_muli_1(%arg0 : index, %arg1 : index) -> index { + %0 = arith.muli %arg0, %arg1 overflow : index + %1 = arith.divsi %0, %arg1 : index + return %1 : index +} +// CHECK-LABEL: func @fold_divsi_of_muli_1( +// CHECK-SAME: %[[ARG0:.+]]: index, +// CHECK-SAME: %[[ARG1:.+]]: index) +// CHECK: return %[[ARG0]] + +// Do not fold divui(mul(a, v), v) -> a with nuw attribute. 
+func.func @no_fold_divui_of_muli(%arg0 : index, %arg1 : index) -> index { + %0 = arith.muli %arg0, %arg1 : index + %1 = arith.divui %0, %arg0 : index + return %1 : index +} +// CHECK-LABEL: func @no_fold_divui_of_muli +// CHECK: %[[T0:.+]] = arith.muli +// CHECK: %[[T1:.+]] = arith.divui %[[T0]], +// CHECK: return %[[T1]] + +// Do not fold divsi(mul(a, v), v) -> a without nsw attribute. +func.func @no_fold_divsi_of_muli(%arg0 : index, %arg1 : index) -> index { + %0 = arith.muli %arg0, %arg1 : index + %1 = arith.divsi %0, %arg0 : index + return %1 : index +} +// CHECK-LABEL: func @no_fold_divsi_of_muli +// CHECK: %[[T0:.+]] = arith.muli +// CHECK: %[[T1:.+]] = arith.divsi %[[T0]], +// CHECK: return %[[T1]] + +// ----- + // CHECK-LABEL: @test_cmpf( func.func @test_cmpf(%arg0 : f32) -> (i1, i1, i1, i1) { // CHECK-DAG: %[[T:.*]] = arith.constant true From 4dfea22e771a0944b3b313f2790a616fa79257e1 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Jan 2025 09:19:32 -0800 Subject: [PATCH 097/480] [ExpandMemCmp][AArch64][PowerPC][RISCV][X86] Use llvm.ucmp instead of (sub (zext (icmp ugt)), (zext (icmp ult))). (#121530) AArch64 and PowerPC look like improvements. RISC-V is neutral. X86 trades a dependency-breaking xor before a seta for a movsx after an sbbb. Depending on how the result is used, this movsx might go away. 
--- llvm/lib/CodeGen/ExpandMemCmp.cpp | 14 ++------ .../AArch64/machine-licm-hoist-load.ll | 3 +- llvm/test/CodeGen/AArch64/memcmp.ll | 15 +++----- llvm/test/CodeGen/PowerPC/memcmp.ll | 18 +++++----- llvm/test/CodeGen/PowerPC/memcmpIR.ll | 16 +++------ llvm/test/CodeGen/RISCV/memcmp-optsize.ll | 36 +++++++++---------- llvm/test/CodeGen/RISCV/memcmp.ll | 36 +++++++++---------- .../CodeGen/X86/memcmp-more-load-pairs-x32.ll | 10 +++--- .../CodeGen/X86/memcmp-more-load-pairs.ll | 24 ++++++------- llvm/test/CodeGen/X86/memcmp-optsize-x32.ll | 10 +++--- llvm/test/CodeGen/X86/memcmp-optsize.ll | 24 ++++++------- llvm/test/CodeGen/X86/memcmp-pgso-x32.ll | 10 +++--- llvm/test/CodeGen/X86/memcmp-pgso.ll | 24 ++++++------- llvm/test/CodeGen/X86/memcmp-x32.ll | 10 +++--- llvm/test/CodeGen/X86/memcmp.ll | 24 ++++++------- .../Transforms/ExpandMemCmp/AArch64/memcmp.ll | 30 +++------------- .../Transforms/ExpandMemCmp/X86/memcmp-x32.ll | 6 +--- .../Transforms/ExpandMemCmp/X86/memcmp.ll | 12 ++----- 18 files changed, 133 insertions(+), 189 deletions(-) diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp index f8ca7e370f6ef..6dc3e04ac802c 100644 --- a/llvm/lib/CodeGen/ExpandMemCmp.cpp +++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp @@ -696,17 +696,9 @@ Value *MemCmpExpansion::getMemCmpOneBlock() { } } - // The result of memcmp is negative, zero, or positive, so produce that by - // subtracting 2 extended compare bits: sub (ugt, ult). - // If a target prefers to use selects to get -1/0/1, they should be able - // to transform this later. The inverse transform (going from selects to math) - // may not be possible in the DAG because the selects got converted into - // branches before we got there. 
- Value *CmpUGT = Builder.CreateICmpUGT(Loads.Lhs, Loads.Rhs); - Value *CmpULT = Builder.CreateICmpULT(Loads.Lhs, Loads.Rhs); - Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty()); - Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty()); - return Builder.CreateSub(ZextUGT, ZextULT); + // The result of memcmp is negative, zero, or positive. + return Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::ucmp, + {Loads.Lhs, Loads.Rhs}); } // This function expands the memcmp call into an inline expansion and returns diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll index 17f8263560430..a32c53a5a5747 100644 --- a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll +++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll @@ -313,9 +313,8 @@ define void @one_dimensional_with_store(ptr %a, ptr %b, ptr %c, i32 %N) { ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w9, w10 ; CHECK-NEXT: cset w9, hi -; CHECK-NEXT: cset w10, lo +; CHECK-NEXT: csinv w9, w9, wzr, hs ; CHECK-NEXT: subs x8, x8, #1 -; CHECK-NEXT: sub w9, w9, w10 ; CHECK-NEXT: strb w9, [x2], #1 ; CHECK-NEXT: b.ne .LBB4_1 ; CHECK-NEXT: // %bb.2: // %for.exit diff --git a/llvm/test/CodeGen/AArch64/memcmp.ll b/llvm/test/CodeGen/AArch64/memcmp.ll index 4da7c8c95a4e4..4f58fd74d7d50 100644 --- a/llvm/test/CodeGen/AArch64/memcmp.ll +++ b/llvm/test/CodeGen/AArch64/memcmp.ll @@ -162,8 +162,7 @@ define i32 @length3(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w8, hi -; CHECK-NEXT: cset w9, lo -; CHECK-NEXT: sub w0, w8, w9 +; CHECK-NEXT: csinv w0, w8, wzr, hs ; CHECK-NEXT: ret %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind ret i32 %m @@ -194,8 +193,7 @@ define i32 @length4(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w8, hi -; CHECK-NEXT: cset w9, lo -; CHECK-NEXT: sub w0, w8, w9 +; CHECK-NEXT: csinv w0, w8, wzr, hs ; 
CHECK-NEXT: ret %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind ret i32 %m @@ -286,8 +284,7 @@ define i32 @length5(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: cset w8, hi -; CHECK-NEXT: cset w9, lo -; CHECK-NEXT: sub w0, w8, w9 +; CHECK-NEXT: csinv w0, w8, wzr, hs ; CHECK-NEXT: ret %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind ret i32 %m @@ -341,8 +338,7 @@ define i32 @length6(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: cset w8, hi -; CHECK-NEXT: cset w9, lo -; CHECK-NEXT: sub w0, w8, w9 +; CHECK-NEXT: csinv w0, w8, wzr, hs ; CHECK-NEXT: ret %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 6) nounwind ret i32 %m @@ -450,8 +446,7 @@ define i32 @length8(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: cset w8, hi -; CHECK-NEXT: cset w9, lo -; CHECK-NEXT: sub w0, w8, w9 +; CHECK-NEXT: csinv w0, w8, wzr, hs ; CHECK-NEXT: ret %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/PowerPC/memcmp.ll b/llvm/test/CodeGen/PowerPC/memcmp.ll index 0634534b9c9df..39f9269997315 100644 --- a/llvm/test/CodeGen/PowerPC/memcmp.ll +++ b/llvm/test/CodeGen/PowerPC/memcmp.ll @@ -6,13 +6,12 @@ define signext i32 @memcmp8(ptr nocapture readonly %buffer1, ptr nocapture reado ; CHECK: # %bb.0: ; CHECK-NEXT: ldbrx 3, 0, 3 ; CHECK-NEXT: ldbrx 4, 0, 4 -; CHECK-NEXT: subc 5, 4, 3 -; CHECK-NEXT: subfe 5, 4, 4 -; CHECK-NEXT: subc 4, 3, 4 -; CHECK-NEXT: subfe 3, 3, 3 -; CHECK-NEXT: neg 5, 5 +; CHECK-NEXT: cmpld 3, 4 +; CHECK-NEXT: subc 3, 4, 3 +; CHECK-NEXT: subfe 3, 4, 4 +; CHECK-NEXT: li 4, -1 ; CHECK-NEXT: neg 3, 3 -; CHECK-NEXT: sub 3, 5, 3 +; CHECK-NEXT: isellt 3, 4, 3 ; CHECK-NEXT: extsw 3, 3 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 8) @@ -24,12 +23,11 @@ define signext i32 @memcmp4(ptr nocapture readonly %buffer1, ptr nocapture reado ; CHECK: # 
%bb.0: ; CHECK-NEXT: lwbrx 3, 0, 3 ; CHECK-NEXT: lwbrx 4, 0, 4 +; CHECK-NEXT: cmplw 3, 4 ; CHECK-NEXT: sub 5, 4, 3 -; CHECK-NEXT: sub 3, 3, 4 +; CHECK-NEXT: li 3, -1 ; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: rldicl 3, 3, 1, 63 -; CHECK-NEXT: sub 3, 5, 3 -; CHECK-NEXT: extsw 3, 3 +; CHECK-NEXT: isellt 3, 3, 5 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4) ret i32 %call diff --git a/llvm/test/CodeGen/PowerPC/memcmpIR.ll b/llvm/test/CodeGen/PowerPC/memcmpIR.ll index 0a8bec7dc0e3f..b57d2b5116b77 100644 --- a/llvm/test/CodeGen/PowerPC/memcmpIR.ll +++ b/llvm/test/CodeGen/PowerPC/memcmpIR.ll @@ -59,22 +59,14 @@ define signext i32 @test2(ptr nocapture readonly %buffer1, ptr nocapture readonl ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]]) ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]]) - ; CHECK-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[BSWAP1]], [[BSWAP2]] - ; CHECK-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[BSWAP1]], [[BSWAP2]] - ; CHECK-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32 - ; CHECK-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32 - ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]] - ; CHECK-NEXT: ret i32 [[SUB]] + ; CHECK-NEXT: [[UCMP:%[0-9]+]] = call i32 @llvm.ucmp.i32.i32(i32 [[BSWAP1]], i32 [[BSWAP2]]) + ; CHECK-NEXT: ret i32 [[UCMP]] ; CHECK-BE-LABEL: @test2( ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, ptr ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr - ; CHECK-BE-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[LOAD1]], [[LOAD2]] - ; CHECK-BE-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[LOAD1]], [[LOAD2]] - ; CHECK-BE-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32 - ; CHECK-BE-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32 - ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]] - ; CHECK-BE-NEXT: ret i32 [[SUB]] + ; CHECK-BE-NEXT: [[UCMP:%[0-9]+]] = call i32 @llvm.ucmp.i32.i32(i32 [[LOAD1]], i32 [[LOAD2]]) + ; 
CHECK-BE-NEXT: ret i32 [[UCMP]] entry: %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4) diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll index b9a27b9d0c9e7..829fdd5592683 100644 --- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll +++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll @@ -2648,9 +2648,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_4: @@ -2661,9 +2661,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_4: @@ -2672,9 +2672,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_4: @@ -2685,9 +2685,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_4: @@ -3462,9 +3462,9 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_8: @@ -3495,9 +3495,9 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; 
CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_8: diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll index 629a9298ee469..bc6cf0a858be8 100644 --- a/llvm/test/CodeGen/RISCV/memcmp.ll +++ b/llvm/test/CodeGen/RISCV/memcmp.ll @@ -3344,9 +3344,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_4: @@ -3357,9 +3357,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_4: @@ -3368,9 +3368,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a2 ; 
CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_4: @@ -3381,9 +3381,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_4: @@ -4158,9 +4158,9 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_8: @@ -4191,9 +4191,9 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_8: diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll 
b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll index ee5fd78c64379..62935f7e372b3 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll @@ -193,13 +193,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%eax), %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %eax ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll index a46f9ed3d3798..9bbd335a903be 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll @@ -179,14 +179,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind { define i32 @length4(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length4: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind ret i32 %m @@ -391,14 +391,14 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind { define i32 @length8(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length8: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq 
%rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll index 4a9643c0f4fc8..3a16ab656b11f 100644 --- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll @@ -122,13 +122,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind optsize { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%eax), %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %eax ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll index 4e27301436c34..0f817b2c727c3 100644 --- a/llvm/test/CodeGen/X86/memcmp-optsize.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll @@ -107,14 +107,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind optsize { define i32 @length4(ptr %X, ptr %Y) nounwind optsize { ; X64-LABEL: length4: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call 
i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind ret i32 %m @@ -186,14 +186,14 @@ define i1 @length5_eq(ptr %X, ptr %Y) nounwind optsize { define i32 @length8(ptr %X, ptr %Y) nounwind optsize { ; X64-LABEL: length8: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll index bdb50f5b60c49..35fd373536bd3 100644 --- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll @@ -122,13 +122,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%eax), %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %eax ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll index 9347e54220220..f638852923187 100644 --- a/llvm/test/CodeGen/X86/memcmp-pgso.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll @@ -107,14 +107,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind !prof !14 { define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 { ; X64-LABEL: length4: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; 
X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind ret i32 %m @@ -186,14 +186,14 @@ define i1 @length5_eq(ptr %X, ptr %Y) nounwind !prof !14 { define i32 @length8(ptr %X, ptr %Y) nounwind !prof !14 { ; X64-LABEL: length8: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll index ad9f2a30d75bb..4a3f5a608e585 100644 --- a/llvm/test/CodeGen/X86/memcmp-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-x32.ll @@ -221,13 +221,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%eax), %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %eax ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index 8fe1a581cd9c2..014db33160606 100644 --- 
a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -205,14 +205,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind { define i32 @length4(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length4: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind ret i32 %m @@ -417,14 +417,14 @@ define i1 @length7_eq(ptr %X, ptr %Y) nounwind { define i32 @length8(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length8: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind ret i32 %m diff --git a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll index 92439691e1873..179b5b0a3dbf5 100644 --- a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll +++ b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll @@ -45,11 +45,7 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; CHECK-NEXT: [[TMP4:%.*]] = zext i24 [[TMP2]] to i32 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]]) -; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 
[[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; CHECK-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP5]], i32 [[TMP6]]) ; CHECK-NEXT: ret i32 [[TMP11]] ; %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 3) @@ -63,11 +59,7 @@ define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[Y]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = zext i1 [[TMP5]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = zext i1 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP3]], i32 [[TMP4]]) ; CHECK-NEXT: ret i32 [[TMP9]] ; %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 4) @@ -83,11 +75,7 @@ define i32 @cmp5(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; CHECK-NEXT: [[TMP4:%.*]] = zext i40 [[TMP2]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]]) -; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; CHECK-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP5]], i64 [[TMP6]]) ; CHECK-NEXT: ret i32 [[TMP11]] ; %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 5) @@ -103,11 +91,7 @@ define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; CHECK-NEXT: [[TMP4:%.*]] = zext i48 [[TMP2]] to 
i64 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]]) -; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; CHECK-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP5]], i64 [[TMP6]]) ; CHECK-NEXT: ret i32 [[TMP11]] ; %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 6) @@ -155,11 +139,7 @@ define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[Y]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = zext i1 [[TMP5]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = zext i1 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP3]], i64 [[TMP4]]) ; CHECK-NEXT: ret i32 [[TMP9]] ; %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 8) diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll index d71ae8be19b66..0507ec9de542e 100644 --- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll +++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll @@ -71,11 +71,7 @@ define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; X32-NEXT: [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1 ; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) ; X32-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]]) -; X32-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]] -; X32-NEXT: 
[[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]] -; X32-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; X32-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; X32-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; X32-NEXT: [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP5]], i32 [[TMP6]]) ; X32-NEXT: ret i32 [[TMP11]] ; %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 4) diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll index f686e29975564..86dc3e5245f24 100644 --- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll +++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll @@ -73,11 +73,7 @@ define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; X64-NEXT: [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1 ; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) ; X64-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]]) -; X64-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]] -; X64-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]] -; X64-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; X64-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; X64-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; X64-NEXT: [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP5]], i32 [[TMP6]]) ; X64-NEXT: ret i32 [[TMP11]] ; %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 4) @@ -189,11 +185,7 @@ define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; X64-NEXT: [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1 ; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) ; X64-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]]) -; X64-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]] -; X64-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]] -; X64-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; X64-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; X64-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; X64-NEXT: [[TMP11:%.*]] = call i32 
@llvm.ucmp.i32.i64(i64 [[TMP5]], i64 [[TMP6]]) ; X64-NEXT: ret i32 [[TMP11]] ; %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 8) From 715dcb2310a4378fdf324cd3d3b47d6f160842aa Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Jan 2025 09:19:54 -0800 Subject: [PATCH 098/480] [ExpandMemCmp] Use m_SpecificInt to simplify code. NFC (#121532) --- llvm/lib/CodeGen/ExpandMemCmp.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp index 6dc3e04ac802c..cc75a01c6477a 100644 --- a/llvm/lib/CodeGen/ExpandMemCmp.cpp +++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp @@ -669,15 +669,15 @@ Value *MemCmpExpansion::getMemCmpOneBlock() { if (CI->hasOneUser()) { auto *UI = cast(*CI->user_begin()); CmpPredicate Pred = ICmpInst::Predicate::BAD_ICMP_PREDICATE; - uint64_t Shift; bool NeedsZExt = false; // This is a special case because instead of checking if the result is less // than zero: // bool result = memcmp(a, b, NBYTES) < 0; // Compiler is clever enough to generate the following code: // bool result = memcmp(a, b, NBYTES) >> 31; - if (match(UI, m_LShr(m_Value(), m_ConstantInt(Shift))) && - Shift == (CI->getType()->getIntegerBitWidth() - 1)) { + if (match(UI, + m_LShr(m_Value(), + m_SpecificInt(CI->getType()->getIntegerBitWidth() - 1)))) { Pred = ICmpInst::ICMP_SLT; NeedsZExt = true; } else { From c19f0f005a1ccf21bd2f0656f90455a55413a32f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Jan 2025 09:20:30 -0800 Subject: [PATCH 099/480] [PatternMatch] Make m_SpecificMask pass expected mask by Value. NFC (#121527) Unlike m_Mask, we don't need to modify a variable owned by the caller so we should pass the ArrayRef by value or const reference. 
--- llvm/include/llvm/IR/PatternMatch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index cc0e8d598ff1e..b37f967191aaa 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -1844,9 +1844,9 @@ struct m_ZeroMask { }; struct m_SpecificMask { - ArrayRef &MaskRef; - m_SpecificMask(ArrayRef &MaskRef) : MaskRef(MaskRef) {} - bool match(ArrayRef Mask) { return MaskRef == Mask; } + ArrayRef Val; + m_SpecificMask(ArrayRef Val) : Val(Val) {} + bool match(ArrayRef Mask) { return Val == Mask; } }; struct m_SplatOrPoisonMask { From e6f76378c20bebec85f66c1574bb6bb928a79025 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 3 Jan 2025 09:25:08 -0800 Subject: [PATCH 100/480] EntryExitInstrumenter: skip available_externally linkage gnu::always_inline functions, which lower to available_externally, may not have definitions external to the module. -finstrument-function family options instrumentating the function (which takes the function address) may lead to a linker error if the function is not optimized out, e.g. ``` // -std=c++17 or above with libstdc++ #include std::string str; int main() {} ``` Simplified reproduce: ``` template struct A { [[gnu::always_inline]] T bar(T a) { return a * 2; } }; extern template class A; int main(int argc, char **argv) { return A().bar(argc); } ``` GCC's -finstrument-function instrumentation skips such functions (https://gcc.gnu.org/PR78333). Let's skip such functions (available_externally) as well. 
Fix #50742 Pull Request: https://github.com/llvm/llvm-project/pull/121452 --- llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp | 6 ++++++ llvm/test/Transforms/EntryExitInstrumenter/mcount.ll | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp index 5b33edd51cffa..d47f1b4253b54 100644 --- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp +++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp @@ -103,6 +103,12 @@ static bool runOnFunction(Function &F, bool PostInlining) { if (F.hasFnAttribute(Attribute::Naked)) return false; + // available_externally functions may not have definitions external to the + // module (e.g. gnu::always_inline). Instrumenting them might lead to linker + // errors if they are optimized out. Skip them like GCC. + if (F.hasAvailableExternallyLinkage()) + return false; + StringRef EntryAttr = PostInlining ? "instrument-function-entry-inlined" : "instrument-function-entry"; diff --git a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll index bd5f4c2b51a89..56ccfb9ed2e7e 100644 --- a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll +++ b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll @@ -129,6 +129,13 @@ define void @naked() naked { ret void } +define available_externally void @always_inline() { +; CHECK-LABEL: define available_externally void @always_inline() { +; CHECK-NEXT: ret void +; + ret void +} + ; The attributes are "consumed" when the instrumentation is inserted. ; CHECK: attributes ; CHECK-NOT: instrument-function From ee9be864bcc5e3cc89f5f23485db2285ad7119f7 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Fri, 3 Jan 2025 09:38:04 -0800 Subject: [PATCH 101/480] [NFC] Fix a typo (#121545) `InputSectionBase::relsOrRelas` make at most one array-ref non-empty. 
One-off counter (as debugging log) shows the number of empty member containers is 2 or 3 in a real build. Fix the typo. --- lld/ELF/InputSection.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index 60988dfacbd74..98e7d5d4ff0cd 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -33,7 +33,7 @@ class SyntheticSection; template class ObjFile; class OutputSection; -// Returned by InputSectionBase::relsOrRelas. At most one member is empty. +// Returned by InputSectionBase::relsOrRelas. At least two members are empty. template struct RelsOrRelas { Relocs rels; Relocs relas; From 82fdd103f9484ce85ec64e3d013cfd8000e22fea Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 3 Jan 2025 16:55:27 +0000 Subject: [PATCH 102/480] [X86] Add test coverage for #107423 --- llvm/test/CodeGen/X86/pr107423.ll | 74 +++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 llvm/test/CodeGen/X86/pr107423.ll diff --git a/llvm/test/CodeGen/X86/pr107423.ll b/llvm/test/CodeGen/X86/pr107423.ll new file mode 100644 index 0000000000000..d5119d45f97c0 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr107423.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s + +define void @PR107423(<64 x i8> %arg, ptr %p0) { +; CHECK-LABEL: PR107423: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vpsllw $8, %xmm2, %xmm2 +; CHECK-NEXT: vpsllw $8, %xmm1, %xmm3 +; CHECK-NEXT: vpaddb %xmm2, %xmm3, %xmm3 +; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm4 +; CHECK-NEXT: vpaddb %xmm1, %xmm4, %xmm1 +; CHECK-NEXT: vpaddb %xmm4, %xmm0, %xmm4 +; CHECK-NEXT: vpsllw $8, %xmm4, %xmm4 +; CHECK-NEXT: vpaddb %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpsllw $8, %xmm1, %xmm1 +; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: 
vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpor %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vpsllw $8, %xmm0, %xmm0 +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqu %xmm0, 16(%rdi) +; CHECK-NEXT: vmovdqu %xmm2, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %i3 = bitcast <64 x i8> %arg to <32 x i16> + %i4 = shufflevector <32 x i16> %i3, <32 x i16> poison, <8 x i32> + %i5 = shl <8 x i16> %i4, + %i6 = bitcast <8 x i16> %i5 to <16 x i8> + %i7 = shufflevector <64 x i8> %arg, <64 x i8> poison, <64 x i32> + %i8 = shufflevector <64 x i8> %arg, <64 x i8> poison, <64 x i32> + %i9 = shufflevector <64 x i8> %i7, <64 x i8> %i8, <64 x i32> + %i10 = shufflevector <16 x i8> %i6, <16 x i8> poison, <64 x i32> + %i11 = shufflevector <64 x i8> %i10, <64 x i8> %i9, <64 x i32> + %i12 = bitcast <64 x i8> %i11 to <32 x i16> + %i13 = shl <32 x i16> %i12, + %i14 = bitcast <32 x i16> %i13 to <64 x i8> + %i15 = shufflevector <64 x i8> %i14, <64 x i8> poison, <16 x i32> + %i16 = shufflevector <64 x i8> %i11, <64 x i8> poison, <64 x i32> + %i17 = shufflevector <16 x i8> %i6, <16 x i8> poison, <64 x i32> + %i18 = shufflevector <64 x i8> %i16, <64 x i8> %i17, <64 x i32> + %i19 = shufflevector <16 x i8> %i15, <16 x i8> poison, <64 x i32> + %i20 = shufflevector <64 x i8> %i19, <64 x i8> %i18, <64 x i32> + %i21 = add <64 x i8> %i20, %i11 + %i22 = bitcast <64 x i8> %i21 to <32 x i16> + %i23 = shl <32 x i16> %i22, + %i24 = bitcast <32 x i16> %i23 to <64 x i8> + %i25 = shufflevector <64 x i8> %i24, <64 x i8> poison, <16 x i32> + %i26 = bitcast <32 x i16> %i23 to <64 x i8> + %i28 = shufflevector <64 x i8> %i26, <64 x i8> poison, <16 x i32> + %i32 = shufflevector <64 x i8> %i21, <64 x i8> poison, <64 x i32> + %i33 = shufflevector <16 x i8> %i25, <16 x i8> poison, <64 x i32> + %i34 = shufflevector <64 x i8> %i32, <64 x i8> %i33, <64 x i32> + %i35 = shufflevector <16 x i8> %i28, <16 x i8> poison, <64 x i32> + %i36 = shufflevector <64 x i8> %i35, <64 x i8> %i34, <64 x i32> + %i37 = add <64 x 
i8> %i36, %i21 + %i38 = bitcast <64 x i8> %i37 to <32 x i16> + %i39 = shufflevector <32 x i16> %i38, <32 x i16> poison, <8 x i32> + %i40 = shl <8 x i16> %i39, + %i41 = bitcast <8 x i16> %i40 to <16 x i8> + %i42 = shufflevector <16 x i8> %i41, <16 x i8> poison, <64 x i32> + %i43 = shufflevector <64 x i8> %i42, <64 x i8> %i37, <64 x i32> + %i44 = bitcast <64 x i8> %i43 to <32 x i16> + %i45 = shufflevector <32 x i16> %i44, <32 x i16> poison, <8 x i32> + %i46 = shl <8 x i16> %i45, + %i47 = bitcast <8 x i16> %i46 to <16 x i8> + %i48 = shufflevector <16 x i8> %i47, <16 x i8> poison, <64 x i32> + %i49 = shufflevector <64 x i8> %i43, <64 x i8> %i48, <32 x i32> + %i50 = shufflevector <64 x i8> %i37, <64 x i8> poison, <32 x i32> + %i51 = add <32 x i8> %i49, %i50 + store <32 x i8> %i51, ptr %p0, align 1 + ret void +} From 13cf5c9c227a502f86f8c0e3c7d5fe147bc91b8b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Jan 2025 09:48:38 -0800 Subject: [PATCH 103/480] [RISCV] Re-generate memcmp test checks missed in #121530. NFC A patch landed to these tests while #121530 was in review and I forgot to rebase. 
--- llvm/test/CodeGen/RISCV/memcmp-optsize.ll | 48 +++++++++++------------ llvm/test/CodeGen/RISCV/memcmp.ll | 48 +++++++++++------------ 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll index 829fdd5592683..f9086ba9d6354 100644 --- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll +++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll @@ -2459,9 +2459,9 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a1, a1, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_3: @@ -2478,9 +2478,9 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_3: @@ -2493,9 +2493,9 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a1, a3, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a0, a1 -; 
CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_3: @@ -2512,9 +2512,9 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_3: @@ -2845,9 +2845,9 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_5: @@ -2878,9 +2878,9 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, 
a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_5: @@ -3044,9 +3044,9 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_6: @@ -3083,9 +3083,9 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_6: diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll index bc6cf0a858be8..831e21af43807 100644 --- a/llvm/test/CodeGen/RISCV/memcmp.ll +++ b/llvm/test/CodeGen/RISCV/memcmp.ll @@ -3155,9 +3155,9 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a1, a1, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 +; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_3: @@ -3174,9 +3174,9 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_3: @@ -3189,9 +3189,9 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a1, a3, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_3: @@ -3208,9 +3208,9 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; 
CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_3: @@ -3541,9 +3541,9 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_5: @@ -3574,9 +3574,9 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_5: @@ -3740,9 +3740,9 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_6: @@ -3779,9 +3779,9 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 ; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_6: From bad0f98bda1ca0b8a106b14b9cce98bf1dbc15cc Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 2 Jan 2025 17:47:31 -0800 Subject: [PATCH 104/480] [ExpandMemCmp][AArch][RISCV][X86] Pre-commit tests for recognizing canonical form of (icmp sle/sge X, 0). NFC Pre-commit for #121540. --- llvm/test/CodeGen/AArch64/memcmp.ll | 378 ++++++------ llvm/test/CodeGen/RISCV/memcmp.ll | 910 +++++++++++++++++++++++----- llvm/test/CodeGen/X86/memcmp.ll | 126 ++-- 3 files changed, 1047 insertions(+), 367 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/memcmp.ll b/llvm/test/CodeGen/AArch64/memcmp.ll index 4f58fd74d7d50..864f38468842a 100644 --- a/llvm/test/CodeGen/AArch64/memcmp.ll +++ b/llvm/test/CodeGen/AArch64/memcmp.ll @@ -257,6 +257,42 @@ define i1 @length4_gt(ptr %X, ptr %Y) nounwind { ret i1 %c } +define i1 @length4_le(ptr %X, ptr %Y) nounwind { +; CHECK-LABEL: length4_le: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: ldr w9, [x1] +; CHECK-NEXT: rev w8, w8 +; CHECK-NEXT: rev w9, w9 +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: cset w8, hi +; CHECK-NEXT: csinv w8, w8, wzr, hs +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: cset w0, lt +; CHECK-NEXT: ret + %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind + %c = icmp slt i32 %m, 1 + ret i1 %c +} + +define i1 @length4_ge(ptr %X, ptr %Y) nounwind { +; CHECK-LABEL: length4_ge: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: ldr w9, [x1] +; CHECK-NEXT: rev w8, w8 +; CHECK-NEXT: rev w9, w9 +; CHECK-NEXT: cmp w8, w9 +; 
CHECK-NEXT: cset w8, hi +; CHECK-NEXT: csinv w8, w8, wzr, hs +; CHECK-NEXT: mvn w8, w8 +; CHECK-NEXT: lsr w0, w8, #31 +; CHECK-NEXT: ret + %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind + %c = icmp sgt i32 %m, -1 + ret i1 %c +} + define i1 @length4_eq_const(ptr %X) nounwind { ; CHECK-LABEL: length4_eq_const: ; CHECK: // %bb.0: @@ -371,18 +407,18 @@ define i32 @length7(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev w8, w8 ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB24_3 +; CHECK-NEXT: b.ne .LBB26_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldur w8, [x0, #3] ; CHECK-NEXT: ldur w9, [x1, #3] ; CHECK-NEXT: rev w8, w8 ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB24_3 +; CHECK-NEXT: b.ne .LBB26_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB24_3: // %res_block +; CHECK-NEXT: .LBB26_3: // %res_block ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -399,18 +435,18 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev w8, w8 ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB25_3 +; CHECK-NEXT: b.ne .LBB27_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldur w8, [x0, #3] ; CHECK-NEXT: ldur w9, [x1, #3] ; CHECK-NEXT: rev w8, w8 ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB25_3 +; CHECK-NEXT: b.ne .LBB27_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB25_3: // %res_block +; CHECK-NEXT: .LBB27_3: // %res_block ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -489,13 +525,13 @@ define i32 @length9(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB30_2 +; CHECK-NEXT: b.ne .LBB32_2 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldrb w8, [x0, #8] ; CHECK-NEXT: ldrb w9, [x1, 
#8] ; CHECK-NEXT: sub w0, w8, w9 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB30_2: // %res_block +; CHECK-NEXT: .LBB32_2: // %res_block ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs ; CHECK-NEXT: ret @@ -527,7 +563,7 @@ define i32 @length10(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB32_3 +; CHECK-NEXT: b.ne .LBB34_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldrh w8, [x0, #8] ; CHECK-NEXT: ldrh w9, [x1, #8] @@ -536,11 +572,11 @@ define i32 @length10(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: lsr w8, w8, #16 ; CHECK-NEXT: lsr w9, w9, #16 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB32_3 +; CHECK-NEXT: b.ne .LBB34_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB32_3: // %res_block +; CHECK-NEXT: .LBB34_3: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -573,18 +609,18 @@ define i32 @length11(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB34_3 +; CHECK-NEXT: b.ne .LBB36_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldur x8, [x0, #3] ; CHECK-NEXT: ldur x9, [x1, #3] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB34_3 +; CHECK-NEXT: b.ne .LBB36_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB34_3: // %res_block +; CHECK-NEXT: .LBB36_3: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -633,18 +669,18 @@ define i32 @length12(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB37_3 +; CHECK-NEXT: b.ne .LBB39_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr w8, [x0, #8] ; CHECK-NEXT: ldr w9, [x1, #8] ; CHECK-NEXT: rev w8, w8 ; CHECK-NEXT: rev w9, 
w9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB37_3 +; CHECK-NEXT: b.ne .LBB39_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB37_3: // %res_block +; CHECK-NEXT: .LBB39_3: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -693,18 +729,18 @@ define i32 @length15(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB40_3 +; CHECK-NEXT: b.ne .LBB42_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldur x8, [x0, #7] ; CHECK-NEXT: ldur x9, [x1, #7] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB40_3 +; CHECK-NEXT: b.ne .LBB42_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB40_3: // %res_block +; CHECK-NEXT: .LBB42_3: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -721,18 +757,18 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB41_3 +; CHECK-NEXT: b.ne .LBB43_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldur x8, [x0, #7] ; CHECK-NEXT: ldur x9, [x1, #7] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB41_3 +; CHECK-NEXT: b.ne .LBB43_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB41_3: // %res_block +; CHECK-NEXT: .LBB43_3: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -753,7 +789,7 @@ define i32 @length15_const(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: movk x8, #12594, lsl #48 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: b.ne .LBB42_3 +; CHECK-NEXT: b.ne .LBB44_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: mov x8, #13365 // =0x3435 ; 
CHECK-NEXT: ldur x9, [x0, #7] @@ -762,11 +798,11 @@ define i32 @length15_const(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: movk x8, #14393, lsl #48 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: b.ne .LBB42_3 +; CHECK-NEXT: b.ne .LBB44_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB42_3: // %res_block +; CHECK-NEXT: .LBB44_3: // %res_block ; CHECK-NEXT: cmp x9, x8 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -801,7 +837,7 @@ define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: movk x8, #12594, lsl #48 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: b.ne .LBB44_3 +; CHECK-NEXT: b.ne .LBB46_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: mov x8, #13365 // =0x3435 ; CHECK-NEXT: ldur x9, [x0, #7] @@ -810,15 +846,15 @@ define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: movk x8, #14393, lsl #48 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: b.ne .LBB44_3 +; CHECK-NEXT: b.ne .LBB46_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB44_4 -; CHECK-NEXT: .LBB44_3: // %res_block +; CHECK-NEXT: b .LBB46_4 +; CHECK-NEXT: .LBB46_3: // %res_block ; CHECK-NEXT: cmp x9, x8 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs -; CHECK-NEXT: .LBB44_4: // %endblock +; CHECK-NEXT: .LBB46_4: // %endblock ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret @@ -836,18 +872,18 @@ define i32 @length16(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB45_3 +; CHECK-NEXT: b.ne .LBB47_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB45_3 +; CHECK-NEXT: b.ne .LBB47_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB45_3: 
// %res_block +; CHECK-NEXT: .LBB47_3: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -878,18 +914,18 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB47_3 +; CHECK-NEXT: b.ne .LBB49_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB47_3 +; CHECK-NEXT: b.ne .LBB49_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB47_3: // %res_block +; CHECK-NEXT: .LBB49_3: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -908,22 +944,22 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB48_3 +; CHECK-NEXT: b.ne .LBB50_3 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB48_3 +; CHECK-NEXT: b.ne .LBB50_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB48_4 -; CHECK-NEXT: .LBB48_3: // %res_block +; CHECK-NEXT: b .LBB50_4 +; CHECK-NEXT: .LBB50_3: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs -; CHECK-NEXT: .LBB48_4: // %endblock +; CHECK-NEXT: .LBB50_4: // %endblock ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret @@ -962,25 +998,25 @@ define i32 @length24(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB50_4 +; CHECK-NEXT: b.ne .LBB52_4 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; 
CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB50_4 +; CHECK-NEXT: b.ne .LBB52_4 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB50_4 +; CHECK-NEXT: b.ne .LBB52_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB50_4: // %res_block +; CHECK-NEXT: .LBB52_4: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -1014,25 +1050,25 @@ define i1 @length24_lt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB52_4 +; CHECK-NEXT: b.ne .LBB54_4 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB52_4 +; CHECK-NEXT: b.ne .LBB54_4 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB52_4 +; CHECK-NEXT: b.ne .LBB54_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB52_4: // %res_block +; CHECK-NEXT: .LBB54_4: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -1051,29 +1087,29 @@ define i1 @length24_gt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB53_4 +; CHECK-NEXT: b.ne .LBB55_4 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB53_4 +; CHECK-NEXT: b.ne .LBB55_4 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; 
CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB53_4 +; CHECK-NEXT: b.ne .LBB55_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB53_5 -; CHECK-NEXT: .LBB53_4: // %res_block +; CHECK-NEXT: b .LBB55_5 +; CHECK-NEXT: .LBB55_4: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs -; CHECK-NEXT: .LBB53_5: // %endblock +; CHECK-NEXT: .LBB55_5: // %endblock ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret @@ -1117,32 +1153,32 @@ define i32 @length31(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB55_5 +; CHECK-NEXT: b.ne .LBB57_5 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB55_5 +; CHECK-NEXT: b.ne .LBB57_5 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB55_5 +; CHECK-NEXT: b.ne .LBB57_5 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldur x8, [x0, #23] ; CHECK-NEXT: ldur x9, [x1, #23] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB55_5 +; CHECK-NEXT: b.ne .LBB57_5 ; CHECK-NEXT: // %bb.4: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB55_5: // %res_block +; CHECK-NEXT: .LBB57_5: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -1179,32 +1215,32 @@ define i1 @length31_lt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB57_5 +; CHECK-NEXT: b.ne .LBB59_5 ; CHECK-NEXT: // %bb.1: // 
%loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB57_5 +; CHECK-NEXT: b.ne .LBB59_5 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB57_5 +; CHECK-NEXT: b.ne .LBB59_5 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldur x8, [x0, #23] ; CHECK-NEXT: ldur x9, [x1, #23] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB57_5 +; CHECK-NEXT: b.ne .LBB59_5 ; CHECK-NEXT: // %bb.4: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB57_5: // %res_block +; CHECK-NEXT: .LBB59_5: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -1223,36 +1259,36 @@ define i1 @length31_gt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB58_5 +; CHECK-NEXT: b.ne .LBB60_5 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB58_5 +; CHECK-NEXT: b.ne .LBB60_5 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB58_5 +; CHECK-NEXT: b.ne .LBB60_5 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldur x8, [x0, #23] ; CHECK-NEXT: ldur x9, [x1, #23] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB58_5 +; CHECK-NEXT: b.ne .LBB60_5 ; CHECK-NEXT: // %bb.4: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB58_6 -; CHECK-NEXT: .LBB58_5: // %res_block +; CHECK-NEXT: b .LBB60_6 +; CHECK-NEXT: .LBB60_5: // 
%res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs -; CHECK-NEXT: .LBB58_6: // %endblock +; CHECK-NEXT: .LBB60_6: // %endblock ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret @@ -1322,32 +1358,32 @@ define i32 @length32(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB61_5 +; CHECK-NEXT: b.ne .LBB63_5 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB61_5 +; CHECK-NEXT: b.ne .LBB63_5 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB61_5 +; CHECK-NEXT: b.ne .LBB63_5 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB61_5 +; CHECK-NEXT: b.ne .LBB63_5 ; CHECK-NEXT: // %bb.4: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB61_5: // %res_block +; CHECK-NEXT: .LBB63_5: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -1383,32 +1419,32 @@ define i1 @length32_lt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB63_5 +; CHECK-NEXT: b.ne .LBB65_5 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB63_5 +; CHECK-NEXT: b.ne .LBB65_5 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: 
cmp x8, x9 -; CHECK-NEXT: b.ne .LBB63_5 +; CHECK-NEXT: b.ne .LBB65_5 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB63_5 +; CHECK-NEXT: b.ne .LBB65_5 ; CHECK-NEXT: // %bb.4: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB63_5: // %res_block +; CHECK-NEXT: .LBB65_5: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -1427,36 +1463,36 @@ define i1 @length32_gt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB64_5 +; CHECK-NEXT: b.ne .LBB66_5 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB64_5 +; CHECK-NEXT: b.ne .LBB66_5 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB64_5 +; CHECK-NEXT: b.ne .LBB66_5 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB64_5 +; CHECK-NEXT: b.ne .LBB66_5 ; CHECK-NEXT: // %bb.4: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB64_6 -; CHECK-NEXT: .LBB64_5: // %res_block +; CHECK-NEXT: b .LBB66_6 +; CHECK-NEXT: .LBB66_5: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs -; CHECK-NEXT: .LBB64_6: // %endblock +; CHECK-NEXT: .LBB66_6: // %endblock ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret @@ -1523,46 +1559,46 @@ define i32 @length48(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; 
CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB67_7 +; CHECK-NEXT: b.ne .LBB69_7 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB67_7 +; CHECK-NEXT: b.ne .LBB69_7 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB67_7 +; CHECK-NEXT: b.ne .LBB69_7 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB67_7 +; CHECK-NEXT: b.ne .LBB69_7 ; CHECK-NEXT: // %bb.4: // %loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB67_7 +; CHECK-NEXT: b.ne .LBB69_7 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB67_7 +; CHECK-NEXT: b.ne .LBB69_7 ; CHECK-NEXT: // %bb.6: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB67_7: // %res_block +; CHECK-NEXT: .LBB69_7: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -1601,46 +1637,46 @@ define i1 @length48_lt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB69_7 +; CHECK-NEXT: b.ne .LBB71_7 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB69_7 +; CHECK-NEXT: b.ne .LBB71_7 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; 
CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB69_7 +; CHECK-NEXT: b.ne .LBB71_7 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB69_7 +; CHECK-NEXT: b.ne .LBB71_7 ; CHECK-NEXT: // %bb.4: // %loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB69_7 +; CHECK-NEXT: b.ne .LBB71_7 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB69_7 +; CHECK-NEXT: b.ne .LBB71_7 ; CHECK-NEXT: // %bb.6: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB69_7: // %res_block +; CHECK-NEXT: .LBB71_7: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -1659,50 +1695,50 @@ define i1 @length48_gt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB70_7 +; CHECK-NEXT: b.ne .LBB72_7 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB70_7 +; CHECK-NEXT: b.ne .LBB72_7 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB70_7 +; CHECK-NEXT: b.ne .LBB72_7 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB70_7 +; CHECK-NEXT: 
b.ne .LBB72_7 ; CHECK-NEXT: // %bb.4: // %loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB70_7 +; CHECK-NEXT: b.ne .LBB72_7 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB70_7 +; CHECK-NEXT: b.ne .LBB72_7 ; CHECK-NEXT: // %bb.6: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB70_8 -; CHECK-NEXT: .LBB70_7: // %res_block +; CHECK-NEXT: b .LBB72_8 +; CHECK-NEXT: .LBB72_7: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs -; CHECK-NEXT: .LBB70_8: // %endblock +; CHECK-NEXT: .LBB72_8: // %endblock ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret @@ -1780,60 +1816,60 @@ define i32 @length63(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB73_9 +; CHECK-NEXT: b.ne .LBB75_9 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB73_9 +; CHECK-NEXT: b.ne .LBB75_9 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB73_9 +; CHECK-NEXT: b.ne .LBB75_9 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB73_9 +; CHECK-NEXT: b.ne .LBB75_9 ; CHECK-NEXT: // %bb.4: // %loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne 
.LBB73_9 +; CHECK-NEXT: b.ne .LBB75_9 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB73_9 +; CHECK-NEXT: b.ne .LBB75_9 ; CHECK-NEXT: // %bb.6: // %loadbb6 ; CHECK-NEXT: ldr x8, [x0, #48] ; CHECK-NEXT: ldr x9, [x1, #48] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB73_9 +; CHECK-NEXT: b.ne .LBB75_9 ; CHECK-NEXT: // %bb.7: // %loadbb7 ; CHECK-NEXT: ldur x8, [x0, #55] ; CHECK-NEXT: ldur x9, [x1, #55] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB73_9 +; CHECK-NEXT: b.ne .LBB75_9 ; CHECK-NEXT: // %bb.8: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB73_9: // %res_block +; CHECK-NEXT: .LBB75_9: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -1878,60 +1914,60 @@ define i1 @length63_lt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB75_9 +; CHECK-NEXT: b.ne .LBB77_9 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB75_9 +; CHECK-NEXT: b.ne .LBB77_9 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB75_9 +; CHECK-NEXT: b.ne .LBB77_9 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB75_9 +; CHECK-NEXT: b.ne .LBB77_9 ; CHECK-NEXT: // %bb.4: // %loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: 
rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB75_9 +; CHECK-NEXT: b.ne .LBB77_9 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB75_9 +; CHECK-NEXT: b.ne .LBB77_9 ; CHECK-NEXT: // %bb.6: // %loadbb6 ; CHECK-NEXT: ldr x8, [x0, #48] ; CHECK-NEXT: ldr x9, [x1, #48] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB75_9 +; CHECK-NEXT: b.ne .LBB77_9 ; CHECK-NEXT: // %bb.7: // %loadbb7 ; CHECK-NEXT: ldur x8, [x0, #55] ; CHECK-NEXT: ldur x9, [x1, #55] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB75_9 +; CHECK-NEXT: b.ne .LBB77_9 ; CHECK-NEXT: // %bb.8: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB75_9: // %res_block +; CHECK-NEXT: .LBB77_9: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -1950,64 +1986,64 @@ define i1 @length63_gt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB76_9 +; CHECK-NEXT: b.ne .LBB78_9 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB76_9 +; CHECK-NEXT: b.ne .LBB78_9 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB76_9 +; CHECK-NEXT: b.ne .LBB78_9 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB76_9 +; CHECK-NEXT: b.ne .LBB78_9 ; CHECK-NEXT: // %bb.4: // 
%loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB76_9 +; CHECK-NEXT: b.ne .LBB78_9 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB76_9 +; CHECK-NEXT: b.ne .LBB78_9 ; CHECK-NEXT: // %bb.6: // %loadbb6 ; CHECK-NEXT: ldr x8, [x0, #48] ; CHECK-NEXT: ldr x9, [x1, #48] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB76_9 +; CHECK-NEXT: b.ne .LBB78_9 ; CHECK-NEXT: // %bb.7: // %loadbb7 ; CHECK-NEXT: ldur x8, [x0, #55] ; CHECK-NEXT: ldur x9, [x1, #55] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB76_9 +; CHECK-NEXT: b.ne .LBB78_9 ; CHECK-NEXT: // %bb.8: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB76_10 -; CHECK-NEXT: .LBB76_9: // %res_block +; CHECK-NEXT: b .LBB78_10 +; CHECK-NEXT: .LBB78_9: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs -; CHECK-NEXT: .LBB76_10: // %endblock +; CHECK-NEXT: .LBB78_10: // %endblock ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret @@ -2071,60 +2107,60 @@ define i32 @length64(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB78_9 +; CHECK-NEXT: b.ne .LBB80_9 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB78_9 +; CHECK-NEXT: b.ne .LBB80_9 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB78_9 +; CHECK-NEXT: b.ne 
.LBB80_9 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB78_9 +; CHECK-NEXT: b.ne .LBB80_9 ; CHECK-NEXT: // %bb.4: // %loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB78_9 +; CHECK-NEXT: b.ne .LBB80_9 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB78_9 +; CHECK-NEXT: b.ne .LBB80_9 ; CHECK-NEXT: // %bb.6: // %loadbb6 ; CHECK-NEXT: ldr x8, [x0, #48] ; CHECK-NEXT: ldr x9, [x1, #48] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB78_9 +; CHECK-NEXT: b.ne .LBB80_9 ; CHECK-NEXT: // %bb.7: // %loadbb7 ; CHECK-NEXT: ldr x8, [x0, #56] ; CHECK-NEXT: ldr x9, [x1, #56] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB78_9 +; CHECK-NEXT: b.ne .LBB80_9 ; CHECK-NEXT: // %bb.8: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB78_9: // %res_block +; CHECK-NEXT: .LBB80_9: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w0, w8, hs @@ -2167,60 +2203,60 @@ define i1 @length64_lt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB80_9 +; CHECK-NEXT: b.ne .LBB82_9 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB80_9 +; CHECK-NEXT: b.ne .LBB82_9 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, [x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev 
x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB80_9 +; CHECK-NEXT: b.ne .LBB82_9 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB80_9 +; CHECK-NEXT: b.ne .LBB82_9 ; CHECK-NEXT: // %bb.4: // %loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB80_9 +; CHECK-NEXT: b.ne .LBB82_9 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB80_9 +; CHECK-NEXT: b.ne .LBB82_9 ; CHECK-NEXT: // %bb.6: // %loadbb6 ; CHECK-NEXT: ldr x8, [x0, #48] ; CHECK-NEXT: ldr x9, [x1, #48] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB80_9 +; CHECK-NEXT: b.ne .LBB82_9 ; CHECK-NEXT: // %bb.7: // %loadbb7 ; CHECK-NEXT: ldr x8, [x0, #56] ; CHECK-NEXT: ldr x9, [x1, #56] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB80_9 +; CHECK-NEXT: b.ne .LBB82_9 ; CHECK-NEXT: // %bb.8: ; CHECK-NEXT: lsr w0, wzr, #31 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB80_9: // %res_block +; CHECK-NEXT: .LBB82_9: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs @@ -2239,64 +2275,64 @@ define i1 @length64_gt(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB81_9 +; CHECK-NEXT: b.ne .LBB83_9 ; CHECK-NEXT: // %bb.1: // %loadbb1 ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: ldr x9, [x1, #8] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB81_9 +; CHECK-NEXT: b.ne .LBB83_9 ; CHECK-NEXT: // %bb.2: // %loadbb2 ; CHECK-NEXT: ldr x8, 
[x0, #16] ; CHECK-NEXT: ldr x9, [x1, #16] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB81_9 +; CHECK-NEXT: b.ne .LBB83_9 ; CHECK-NEXT: // %bb.3: // %loadbb3 ; CHECK-NEXT: ldr x8, [x0, #24] ; CHECK-NEXT: ldr x9, [x1, #24] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB81_9 +; CHECK-NEXT: b.ne .LBB83_9 ; CHECK-NEXT: // %bb.4: // %loadbb4 ; CHECK-NEXT: ldr x8, [x0, #32] ; CHECK-NEXT: ldr x9, [x1, #32] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB81_9 +; CHECK-NEXT: b.ne .LBB83_9 ; CHECK-NEXT: // %bb.5: // %loadbb5 ; CHECK-NEXT: ldr x8, [x0, #40] ; CHECK-NEXT: ldr x9, [x1, #40] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB81_9 +; CHECK-NEXT: b.ne .LBB83_9 ; CHECK-NEXT: // %bb.6: // %loadbb6 ; CHECK-NEXT: ldr x8, [x0, #48] ; CHECK-NEXT: ldr x9, [x1, #48] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB81_9 +; CHECK-NEXT: b.ne .LBB83_9 ; CHECK-NEXT: // %bb.7: // %loadbb7 ; CHECK-NEXT: ldr x8, [x0, #56] ; CHECK-NEXT: ldr x9, [x1, #56] ; CHECK-NEXT: rev x8, x8 ; CHECK-NEXT: rev x9, x9 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: b.ne .LBB81_9 +; CHECK-NEXT: b.ne .LBB83_9 ; CHECK-NEXT: // %bb.8: ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB81_10 -; CHECK-NEXT: .LBB81_9: // %res_block +; CHECK-NEXT: b .LBB83_10 +; CHECK-NEXT: .LBB83_9: // %res_block ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: cneg w8, w8, hs -; CHECK-NEXT: .LBB81_10: // %endblock +; CHECK-NEXT: .LBB83_10: // %endblock ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll index 831e21af43807..5adda28acb427 100644 --- a/llvm/test/CodeGen/RISCV/memcmp.ll +++ b/llvm/test/CodeGen/RISCV/memcmp.ll @@ -2710,6 +2710,216 @@ 
entry: ret i1 %ret } +define i1 @bcmp_le_zero(ptr %s1, ptr %s2) nounwind { +; CHECK-ALIGNED-RV32-LABEL: bcmp_le_zero: +; CHECK-ALIGNED-RV32: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-NEXT: call bcmp +; CHECK-ALIGNED-RV32-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-NEXT: ret +; +; CHECK-ALIGNED-RV64-LABEL: bcmp_le_zero: +; CHECK-ALIGNED-RV64: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-NEXT: call bcmp +; CHECK-ALIGNED-RV64-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-NEXT: ret +; +; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_le_zero: +; CHECK-ALIGNED-RV32-ZBB: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-ZBB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-ZBB-NEXT: call bcmp +; CHECK-ALIGNED-RV32-ZBB-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: ret +; +; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_le_zero: +; CHECK-ALIGNED-RV64-ZBB: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-ZBB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-ZBB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-ZBB-NEXT: call bcmp +; CHECK-ALIGNED-RV64-ZBB-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV64-ZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-ZBB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: ret +; +; 
CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_le_zero: +; CHECK-ALIGNED-RV32-ZBKB: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-ZBKB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-ZBKB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: call bcmp +; CHECK-ALIGNED-RV32-ZBKB-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-ZBKB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: ret +; +; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_le_zero: +; CHECK-ALIGNED-RV64-ZBKB: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-ZBKB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-ZBKB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: call bcmp +; CHECK-ALIGNED-RV64-ZBKB-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-ZBKB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: ret +; +; CHECK-ALIGNED-RV32-V-LABEL: bcmp_le_zero: +; CHECK-ALIGNED-RV32-V: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-V-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-V-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-V-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-V-NEXT: call bcmp +; CHECK-ALIGNED-RV32-V-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV32-V-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-V-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-V-NEXT: ret +; +; CHECK-ALIGNED-RV64-V-LABEL: bcmp_le_zero: +; CHECK-ALIGNED-RV64-V: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-V-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-V-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-V-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-V-NEXT: call bcmp +; CHECK-ALIGNED-RV64-V-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV64-V-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-V-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-V-NEXT: ret +; +; CHECK-UNALIGNED-LABEL: bcmp_le_zero: +; 
CHECK-UNALIGNED: # %bb.0: # %entry +; CHECK-UNALIGNED-NEXT: lw a0, 0(a0) +; CHECK-UNALIGNED-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-NEXT: xor a0, a0, a1 +; CHECK-UNALIGNED-NEXT: snez a0, a0 +; CHECK-UNALIGNED-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 4) + %ret = icmp slt i32 %bcmp, 1 + ret i1 %ret +} + +define i1 @bcmp_ge_zero(ptr %s1, ptr %s2) nounwind { +; CHECK-ALIGNED-RV32-LABEL: bcmp_ge_zero: +; CHECK-ALIGNED-RV32: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-NEXT: call bcmp +; CHECK-ALIGNED-RV32-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV32-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-NEXT: ret +; +; CHECK-ALIGNED-RV64-LABEL: bcmp_ge_zero: +; CHECK-ALIGNED-RV64: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-NEXT: call bcmp +; CHECK-ALIGNED-RV64-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV64-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-NEXT: ret +; +; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_ge_zero: +; CHECK-ALIGNED-RV32-ZBB: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-ZBB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-ZBB-NEXT: call bcmp +; CHECK-ALIGNED-RV32-ZBB-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV32-ZBB-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: ret +; +; CHECK-ALIGNED-RV64-ZBB-LABEL: 
bcmp_ge_zero: +; CHECK-ALIGNED-RV64-ZBB: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-ZBB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-ZBB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-ZBB-NEXT: call bcmp +; CHECK-ALIGNED-RV64-ZBB-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV64-ZBB-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV64-ZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-ZBB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: ret +; +; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_ge_zero: +; CHECK-ALIGNED-RV32-ZBKB: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-ZBKB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-ZBKB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: call bcmp +; CHECK-ALIGNED-RV32-ZBKB-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-ZBKB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: ret +; +; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_ge_zero: +; CHECK-ALIGNED-RV64-ZBKB: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-ZBKB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-ZBKB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: call bcmp +; CHECK-ALIGNED-RV64-ZBKB-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-ZBKB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: ret +; +; CHECK-ALIGNED-RV32-V-LABEL: bcmp_ge_zero: +; CHECK-ALIGNED-RV32-V: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-V-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-V-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-V-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-V-NEXT: call bcmp +; CHECK-ALIGNED-RV32-V-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV32-V-NEXT: xori a0, a0, 1 +; 
CHECK-ALIGNED-RV32-V-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-V-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-V-NEXT: ret +; +; CHECK-ALIGNED-RV64-V-LABEL: bcmp_ge_zero: +; CHECK-ALIGNED-RV64-V: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-V-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-V-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-V-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-V-NEXT: call bcmp +; CHECK-ALIGNED-RV64-V-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV64-V-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV64-V-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-V-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-V-NEXT: ret +; +; CHECK-UNALIGNED-LABEL: bcmp_ge_zero: +; CHECK-UNALIGNED: # %bb.0: # %entry +; CHECK-UNALIGNED-NEXT: li a0, 1 +; CHECK-UNALIGNED-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 4) + %ret = icmp sgt i32 %bcmp, -1 + ret i1 %ret +} + define i32 @memcmp_size_0(ptr %s1, ptr %s2) nounwind { ; CHECK-LABEL: memcmp_size_0: ; CHECK: # %bb.0: # %entry @@ -3517,13 +3727,13 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB26_2 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB28_2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a0, 4(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a1, 4(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB26_2: # %res_block +; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB28_2: # %res_block ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 @@ -3552,13 +3762,13 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 0(a1) ; 
CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB26_2 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB28_2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a0, 4(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a1, 4(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB26_2: # %res_block +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB28_2: # %res_block ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 @@ -3710,7 +3920,7 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB27_3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB29_3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a0, 4(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a1, 4(a1) @@ -3718,11 +3928,11 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a3, a3, 16 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB27_3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB29_3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV32-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB27_3: # %res_block +; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB29_3: # %res_block ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 @@ -3751,7 +3961,7 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 0(a1) ; 
CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB27_3 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB29_3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a0, 4(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a1, 4(a1) @@ -3759,11 +3969,11 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a3, a3, 16 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB27_3 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB29_3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB27_3: # %res_block +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB29_3: # %res_block ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 @@ -3915,17 +4125,17 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB28_3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB30_3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 3(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 3(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB28_3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB30_3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV32-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB28_3: # %res_block +; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB30_3: # %res_block ; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 @@ -3939,7 +4149,7 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB28_3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB30_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a0, 3(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a1, 3(a1) @@ -3947,11 +4157,11 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB28_3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB30_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB28_3: # %res_block +; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB30_3: # %res_block ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 @@ -3963,17 +4173,17 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB28_3 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB30_3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 3(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 3(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB28_3 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, 
.LBB30_3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB28_3: # %res_block +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB30_3: # %res_block ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 @@ -3987,7 +4197,7 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB28_3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB30_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a0, 3(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a1, 3(a1) @@ -3995,11 +4205,11 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB28_3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB30_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB28_3: # %res_block +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB30_3: # %res_block ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 @@ -4136,17 +4346,17 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB29_3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB31_3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 ; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 4(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 4(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB29_3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB31_3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV32-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB29_3: # %res_block +; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB31_3: # %res_block ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 @@ -4169,17 +4379,17 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB29_3 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB31_3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 4(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 4(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB29_3 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB31_3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB29_3: # %res_block +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB31_3: # %res_block ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 @@ -4327,29 +4537,29 @@ define i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, 
.LBB30_5 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 4(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 4(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB30_5 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 8(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 8(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB30_5 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 11(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 11(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB30_5 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.4: ; CHECK-UNALIGNED-RV32-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB30_5: # %res_block +; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB32_5: # %res_block ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 @@ -4361,17 +4571,17 @@ define i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB30_3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB32_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a0, 7(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 7(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 ; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB30_3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB32_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB30_3: # %res_block +; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB32_3: # %res_block ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 @@ -4383,29 +4593,29 @@ define i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB30_5 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 4(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 4(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB30_5 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 8(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 8(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB30_5 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 11(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 11(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB30_5 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.4: ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: li 
a0, 0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB30_5: # %res_block +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB32_5: # %res_block ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 @@ -4417,17 +4627,17 @@ define i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB30_3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB32_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a0, 7(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 7(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB30_3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB32_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB30_3: # %res_block +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB32_3: # %res_block ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 @@ -4564,29 +4774,29 @@ define i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB31_5 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 4(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 4(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB31_5 +; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 8(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 8(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB31_5 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 12(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 12(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB31_5 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.4: ; CHECK-UNALIGNED-RV32-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB31_5: # %res_block +; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB33_5: # %res_block ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 @@ -4598,17 +4808,17 @@ define i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB31_3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB33_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a0, 8(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 8(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB31_3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB33_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB31_3: # %res_block +; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB33_3: # %res_block ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 @@ -4620,29 +4830,29 @@ define i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB31_5 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 4(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 4(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB31_5 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 8(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 8(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB31_5 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 12(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 12(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB31_5 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.4: ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB31_5: # %res_block +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB33_5: # %res_block ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 @@ -4654,17 +4864,17 @@ define 
i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB31_3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB33_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a0, 8(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 8(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB31_3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB33_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB31_3: # %res_block +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB33_3: # %res_block ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 @@ -4801,53 +5011,53 @@ define i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 4(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 4(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 8(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 8(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_9 +; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 12(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 12(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.4: # %loadbb4 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 16(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 16(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.5: # %loadbb5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 20(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 20(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.6: # %loadbb6 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 24(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 24(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.7: # %loadbb7 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 27(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 27(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.8: ; CHECK-UNALIGNED-RV32-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB32_9: # %res_block +; CHECK-UNALIGNED-RV32-ZBB-NEXT: 
.LBB34_9: # %res_block ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 @@ -4859,29 +5069,29 @@ define i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB32_5 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 8(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 8(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB32_5 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 16(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 16(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB32_5 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a0, 23(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 23(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB32_5 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.4: ; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB32_5: # %res_block +; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB34_5: # %res_block ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 @@ -4893,53 +5103,53 @@ define i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind { ; 
CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 4(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 4(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 8(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 8(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 12(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 12(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.4: # %loadbb4 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 16(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 16(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.5: # %loadbb5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 20(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 20(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne 
a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.6: # %loadbb6 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 24(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 24(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.7: # %loadbb7 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 27(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 27(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB32_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB34_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.8: ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB32_9: # %res_block +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB34_9: # %res_block ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 @@ -4951,29 +5161,29 @@ define i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB32_5 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 8(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 8(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB32_5 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 16(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 16(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB32_5 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a0, 23(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 23(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB32_5 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.4: ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB32_5: # %res_block +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB34_5: # %res_block ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 @@ -5110,53 +5320,53 @@ define i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 4(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 4(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 8(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 8(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw 
a2, 12(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 12(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.4: # %loadbb4 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 16(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 16(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.5: # %loadbb5 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 20(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 20(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.6: # %loadbb6 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a2, 24(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a3, 24(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.7: # %loadbb7 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 28(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 28(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.8: ; CHECK-UNALIGNED-RV32-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB33_9: # %res_block +; CHECK-UNALIGNED-RV32-ZBB-NEXT: .LBB35_9: # %res_block ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: neg a0, a0 ; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: ori a0, a0, 1 @@ -5168,29 +5378,29 @@ define i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB33_5 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 8(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 8(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB33_5 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 16(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 16(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB33_5 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a0, 24(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 24(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB33_5 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.4: ; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB33_5: # %res_block +; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB35_5: # %res_block ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 @@ -5202,53 +5412,53 @@ define i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 
a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 4(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 4(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 8(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 8(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 12(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 12(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.4: # %loadbb4 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 16(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 16(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.5: # %loadbb5 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 20(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 20(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.6: # %loadbb6 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a2, 24(a0) ; 
CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a3, 24(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.7: # %loadbb7 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 28(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 28(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB33_9 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB35_9 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.8: ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB33_9: # %res_block +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: .LBB35_9: # %res_block ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ori a0, a0, 1 @@ -5260,29 +5470,29 @@ define i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB33_5 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 8(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 8(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB33_5 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 16(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 16(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB33_5 +; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a0, 24(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 24(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB33_5 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.4: ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB33_5: # %res_block +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB35_5: # %res_block ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 @@ -5379,53 +5589,53 @@ define i32 @memcmp_size_63(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 8(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 8(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 16(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 16(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 24(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 24(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 
; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.4: # %loadbb4 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 32(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 32(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.5: # %loadbb5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 40(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 40(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.6: # %loadbb6 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 48(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 48(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.7: # %loadbb7 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a0, 55(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 55(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.8: ; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB34_9: # %res_block +; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB36_9: # %res_block ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 @@ -5437,53 +5647,53 @@ define i32 @memcmp_size_63(ptr %s1, ptr %s2) nounwind { ; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 8(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 8(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 16(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 16(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 24(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 24(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.4: # %loadbb4 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 32(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 32(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.5: # %loadbb5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 40(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 40(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne 
a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.6: # %loadbb6 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 48(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 48(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.7: # %loadbb7 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a0, 55(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 55(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB34_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB36_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.8: ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB34_9: # %res_block +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB36_9: # %res_block ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 @@ -5570,53 +5780,53 @@ define i32 @memcmp_size_64(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 8(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 8(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 16(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 16(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 24(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 24(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.4: # %loadbb4 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 32(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 32(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.5: # %loadbb5 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 40(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 40(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.6: # %loadbb6 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a2, 48(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a3, 48(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.7: # %loadbb7 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a0, 56(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 56(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.8: ; CHECK-UNALIGNED-RV64-ZBB-NEXT: li a0, 0 ; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB35_9: # %res_block +; CHECK-UNALIGNED-RV64-ZBB-NEXT: .LBB37_9: # %res_block ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ori a0, a0, 1 @@ -5628,53 +5838,53 @@ define i32 @memcmp_size_64(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 8(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 8(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: # %loadbb2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 16(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 16(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.3: # %loadbb3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 24(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 24(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.4: # %loadbb4 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 32(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 32(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_9 +; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.5: # %loadbb5 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 40(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 40(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.6: # %loadbb6 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a2, 48(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a3, 48(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.7: # %loadbb7 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a0, 56(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 56(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB35_9 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB37_9 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.8: ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: li a0, 0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB35_9: # %res_block +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: .LBB37_9: # %res_block ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a2, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: neg a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ori a0, a0, 1 @@ -6336,5 +6546,401 @@ entry: %ret = icmp sgt i32 %memcmp, 0 ret i1 %ret } + +define i1 @memcmp_le_zero(ptr %s1, ptr %s2) nounwind { +; CHECK-ALIGNED-RV32-LABEL: memcmp_le_zero: +; CHECK-ALIGNED-RV32: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-NEXT: call memcmp +; CHECK-ALIGNED-RV32-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV32-NEXT: lw ra, 12(sp) # 4-byte 
Folded Reload +; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-NEXT: ret +; +; CHECK-ALIGNED-RV64-LABEL: memcmp_le_zero: +; CHECK-ALIGNED-RV64: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-NEXT: call memcmp +; CHECK-ALIGNED-RV64-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-NEXT: ret +; +; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_le_zero: +; CHECK-ALIGNED-RV32-ZBB: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-ZBB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-ZBB-NEXT: call memcmp +; CHECK-ALIGNED-RV32-ZBB-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: ret +; +; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_le_zero: +; CHECK-ALIGNED-RV64-ZBB: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-ZBB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-ZBB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-ZBB-NEXT: call memcmp +; CHECK-ALIGNED-RV64-ZBB-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV64-ZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-ZBB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: ret +; +; CHECK-ALIGNED-RV32-ZBKB-LABEL: memcmp_le_zero: +; CHECK-ALIGNED-RV32-ZBKB: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-ZBKB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-ZBKB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: call memcmp +; CHECK-ALIGNED-RV32-ZBKB-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-ZBKB-NEXT: 
addi sp, sp, 16 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: ret +; +; CHECK-ALIGNED-RV64-ZBKB-LABEL: memcmp_le_zero: +; CHECK-ALIGNED-RV64-ZBKB: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-ZBKB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-ZBKB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: call memcmp +; CHECK-ALIGNED-RV64-ZBKB-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-ZBKB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: ret +; +; CHECK-ALIGNED-RV32-V-LABEL: memcmp_le_zero: +; CHECK-ALIGNED-RV32-V: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-V-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-V-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-V-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-V-NEXT: call memcmp +; CHECK-ALIGNED-RV32-V-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV32-V-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-V-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-V-NEXT: ret +; +; CHECK-ALIGNED-RV64-V-LABEL: memcmp_le_zero: +; CHECK-ALIGNED-RV64-V: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-V-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-V-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-V-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-V-NEXT: call memcmp +; CHECK-ALIGNED-RV64-V-NEXT: slti a0, a0, 1 +; CHECK-ALIGNED-RV64-V-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-V-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-V-NEXT: ret +; +; CHECK-UNALIGNED-RV32-LABEL: memcmp_le_zero: +; CHECK-UNALIGNED-RV32: # %bb.0: # %entry +; CHECK-UNALIGNED-RV32-NEXT: addi sp, sp, -16 +; CHECK-UNALIGNED-RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-UNALIGNED-RV32-NEXT: li a2, 4 +; CHECK-UNALIGNED-RV32-NEXT: call memcmp +; CHECK-UNALIGNED-RV32-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-UNALIGNED-RV32-NEXT: addi sp, sp, 16 +; CHECK-UNALIGNED-RV32-NEXT: ret +; 
+; CHECK-UNALIGNED-RV64-LABEL: memcmp_le_zero: +; CHECK-UNALIGNED-RV64: # %bb.0: # %entry +; CHECK-UNALIGNED-RV64-NEXT: addi sp, sp, -16 +; CHECK-UNALIGNED-RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-UNALIGNED-RV64-NEXT: li a2, 4 +; CHECK-UNALIGNED-RV64-NEXT: call memcmp +; CHECK-UNALIGNED-RV64-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-UNALIGNED-RV64-NEXT: addi sp, sp, 16 +; CHECK-UNALIGNED-RV64-NEXT: ret +; +; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_le_zero: +; CHECK-UNALIGNED-RV32-ZBB: # %bb.0: # %entry +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 0(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a2 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret +; +; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_le_zero: +; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret +; +; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_le_zero: +; CHECK-UNALIGNED-RV32-ZBKB: # %bb.0: # %entry +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 0(a0) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: 
sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a2 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret +; +; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_le_zero: +; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret +; +; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_le_zero: +; CHECK-UNALIGNED-RV32-V: # %bb.0: # %entry +; CHECK-UNALIGNED-RV32-V-NEXT: addi sp, sp, -16 +; CHECK-UNALIGNED-RV32-V-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-UNALIGNED-RV32-V-NEXT: li a2, 4 +; CHECK-UNALIGNED-RV32-V-NEXT: call memcmp +; CHECK-UNALIGNED-RV32-V-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV32-V-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-UNALIGNED-RV32-V-NEXT: addi sp, sp, 16 +; CHECK-UNALIGNED-RV32-V-NEXT: ret +; +; CHECK-UNALIGNED-RV64-V-LABEL: memcmp_le_zero: +; CHECK-UNALIGNED-RV64-V: # %bb.0: # %entry +; CHECK-UNALIGNED-RV64-V-NEXT: addi sp, sp, -16 +; CHECK-UNALIGNED-RV64-V-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-UNALIGNED-RV64-V-NEXT: li a2, 4 +; CHECK-UNALIGNED-RV64-V-NEXT: call memcmp +; CHECK-UNALIGNED-RV64-V-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV64-V-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-UNALIGNED-RV64-V-NEXT: addi sp, sp, 16 +; CHECK-UNALIGNED-RV64-V-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iXLen 4) + %ret = icmp slt i32 %memcmp, 1 + ret i1 %ret +} + +define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind { +; CHECK-ALIGNED-RV32-LABEL: 
memcmp_ge_zero: +; CHECK-ALIGNED-RV32: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-NEXT: call memcmp +; CHECK-ALIGNED-RV32-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV32-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-NEXT: ret +; +; CHECK-ALIGNED-RV64-LABEL: memcmp_ge_zero: +; CHECK-ALIGNED-RV64: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-NEXT: call memcmp +; CHECK-ALIGNED-RV64-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV64-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-NEXT: ret +; +; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_ge_zero: +; CHECK-ALIGNED-RV32-ZBB: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-ZBB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-ZBB-NEXT: call memcmp +; CHECK-ALIGNED-RV32-ZBB-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV32-ZBB-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: ret +; +; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_ge_zero: +; CHECK-ALIGNED-RV64-ZBB: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-ZBB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-ZBB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-ZBB-NEXT: call memcmp +; CHECK-ALIGNED-RV64-ZBB-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV64-ZBB-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV64-ZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; 
CHECK-ALIGNED-RV64-ZBB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: ret +; +; CHECK-ALIGNED-RV32-ZBKB-LABEL: memcmp_ge_zero: +; CHECK-ALIGNED-RV32-ZBKB: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-ZBKB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-ZBKB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: call memcmp +; CHECK-ALIGNED-RV32-ZBKB-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-ZBKB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: ret +; +; CHECK-ALIGNED-RV64-ZBKB-LABEL: memcmp_ge_zero: +; CHECK-ALIGNED-RV64-ZBKB: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-ZBKB-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-ZBKB-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: call memcmp +; CHECK-ALIGNED-RV64-ZBKB-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-ZBKB-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: ret +; +; CHECK-ALIGNED-RV32-V-LABEL: memcmp_ge_zero: +; CHECK-ALIGNED-RV32-V: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-V-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV32-V-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-ALIGNED-RV32-V-NEXT: li a2, 4 +; CHECK-ALIGNED-RV32-V-NEXT: call memcmp +; CHECK-ALIGNED-RV32-V-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV32-V-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV32-V-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-ALIGNED-RV32-V-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV32-V-NEXT: ret +; +; CHECK-ALIGNED-RV64-V-LABEL: memcmp_ge_zero: +; CHECK-ALIGNED-RV64-V: # %bb.0: # %entry +; CHECK-ALIGNED-RV64-V-NEXT: addi sp, sp, -16 +; CHECK-ALIGNED-RV64-V-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-ALIGNED-RV64-V-NEXT: li a2, 4 +; CHECK-ALIGNED-RV64-V-NEXT: call 
memcmp +; CHECK-ALIGNED-RV64-V-NEXT: slti a0, a0, 0 +; CHECK-ALIGNED-RV64-V-NEXT: xori a0, a0, 1 +; CHECK-ALIGNED-RV64-V-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-ALIGNED-RV64-V-NEXT: addi sp, sp, 16 +; CHECK-ALIGNED-RV64-V-NEXT: ret +; +; CHECK-UNALIGNED-RV32-LABEL: memcmp_ge_zero: +; CHECK-UNALIGNED-RV32: # %bb.0: # %entry +; CHECK-UNALIGNED-RV32-NEXT: addi sp, sp, -16 +; CHECK-UNALIGNED-RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-UNALIGNED-RV32-NEXT: li a2, 4 +; CHECK-UNALIGNED-RV32-NEXT: call memcmp +; CHECK-UNALIGNED-RV32-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV32-NEXT: xori a0, a0, 1 +; CHECK-UNALIGNED-RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-UNALIGNED-RV32-NEXT: addi sp, sp, 16 +; CHECK-UNALIGNED-RV32-NEXT: ret +; +; CHECK-UNALIGNED-RV64-LABEL: memcmp_ge_zero: +; CHECK-UNALIGNED-RV64: # %bb.0: # %entry +; CHECK-UNALIGNED-RV64-NEXT: addi sp, sp, -16 +; CHECK-UNALIGNED-RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-UNALIGNED-RV64-NEXT: li a2, 4 +; CHECK-UNALIGNED-RV64-NEXT: call memcmp +; CHECK-UNALIGNED-RV64-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV64-NEXT: xori a0, a0, 1 +; CHECK-UNALIGNED-RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-UNALIGNED-RV64-NEXT: addi sp, sp, 16 +; CHECK-UNALIGNED-RV64-NEXT: ret +; +; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_ge_zero: +; CHECK-UNALIGNED-RV32-ZBB: # %bb.0: # %entry +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 0(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a2 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: xori a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret +; +; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_ge_zero: +; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw 
a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: xori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret +; +; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_ge_zero: +; CHECK-UNALIGNED-RV32-ZBKB: # %bb.0: # %entry +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 0(a0) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a2 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xori a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret +; +; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_ge_zero: +; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a0, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a1, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xori a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret +; +; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_ge_zero: +; CHECK-UNALIGNED-RV32-V: # %bb.0: # %entry +; CHECK-UNALIGNED-RV32-V-NEXT: addi sp, sp, -16 +; CHECK-UNALIGNED-RV32-V-NEXT: sw ra, 12(sp) # 
4-byte Folded Spill +; CHECK-UNALIGNED-RV32-V-NEXT: li a2, 4 +; CHECK-UNALIGNED-RV32-V-NEXT: call memcmp +; CHECK-UNALIGNED-RV32-V-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV32-V-NEXT: xori a0, a0, 1 +; CHECK-UNALIGNED-RV32-V-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-UNALIGNED-RV32-V-NEXT: addi sp, sp, 16 +; CHECK-UNALIGNED-RV32-V-NEXT: ret +; +; CHECK-UNALIGNED-RV64-V-LABEL: memcmp_ge_zero: +; CHECK-UNALIGNED-RV64-V: # %bb.0: # %entry +; CHECK-UNALIGNED-RV64-V-NEXT: addi sp, sp, -16 +; CHECK-UNALIGNED-RV64-V-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-UNALIGNED-RV64-V-NEXT: li a2, 4 +; CHECK-UNALIGNED-RV64-V-NEXT: call memcmp +; CHECK-UNALIGNED-RV64-V-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV64-V-NEXT: xori a0, a0, 1 +; CHECK-UNALIGNED-RV64-V-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-UNALIGNED-RV64-V-NEXT: addi sp, sp, 16 +; CHECK-UNALIGNED-RV64-V-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iXLen 4) + %ret = icmp sgt i32 %memcmp, -1 + ret i1 %ret +} ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; CHECK-ALIGNED: {{.*}} diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index 014db33160606..e744d2a06e55f 100644 --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -260,6 +260,44 @@ define i1 @length4_gt(ptr %X, ptr %Y) nounwind { ret i1 %c } +define i1 @length4_le(ptr %X, ptr %Y) nounwind { +; X64-LABEL: length4_le: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setle %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind + %c = icmp slt i32 %m, 1 + ret i1 %c +} + +define i1 @length4_ge(ptr %X, ptr %Y) nounwind { +; X64-LABEL: length4_ge: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setns %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind + %c = icmp sgt i32 %m, -1 + ret i1 %c +} + define i1 @length4_eq_const(ptr %X) nounwind { ; X64-LABEL: length4_eq_const: ; X64: # %bb.0: @@ -279,13 +317,13 @@ define i32 @length5(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: jne .LBB18_3 +; X64-NEXT: jne .LBB20_3 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movzbl 4(%rdi), %eax ; X64-NEXT: movzbl 4(%rsi), %ecx ; X64-NEXT: subl %ecx, %eax ; X64-NEXT: retq -; X64-NEXT: .LBB18_3: # %res_block +; X64-NEXT: .LBB20_3: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: sbbl %eax, %eax @@ -319,7 +357,7 @@ define i1 @length5_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %ecx ; 
X64-NEXT: bswapl %edx ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: jne .LBB20_3 +; X64-NEXT: jne .LBB22_3 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movzbl 4(%rdi), %eax ; X64-NEXT: movzbl 4(%rsi), %ecx @@ -327,7 +365,7 @@ define i1 @length5_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq -; X64-NEXT: .LBB20_3: # %res_block +; X64-NEXT: .LBB22_3: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: sbbl %eax, %eax @@ -348,7 +386,7 @@ define i32 @length7(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: jne .LBB21_2 +; X64-NEXT: jne .LBB23_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movl 3(%rdi), %ecx ; X64-NEXT: movl 3(%rsi), %edx @@ -356,13 +394,13 @@ define i32 @length7(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: je .LBB21_3 -; X64-NEXT: .LBB21_2: # %res_block +; X64-NEXT: je .LBB23_3 +; X64-NEXT: .LBB23_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB21_3: # %endblock +; X64-NEXT: .LBB23_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind ret i32 %m @@ -376,7 +414,7 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: jne .LBB22_2 +; X64-NEXT: jne .LBB24_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movl 3(%rdi), %ecx ; X64-NEXT: movl 3(%rsi), %edx @@ -384,13 +422,13 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: je .LBB22_3 -; X64-NEXT: .LBB22_2: # %res_block +; X64-NEXT: je .LBB24_3 +; X64-NEXT: .LBB24_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: sbbl %eax, %eax ; 
X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB22_3: # %endblock +; X64-NEXT: .LBB24_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -524,7 +562,7 @@ define i32 @length12(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB31_2 +; X64-NEXT: jne .LBB33_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx @@ -532,13 +570,13 @@ define i32 @length12(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB31_3 -; X64-NEXT: .LBB31_2: # %res_block +; X64-NEXT: je .LBB33_3 +; X64-NEXT: .LBB33_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB31_3: # %endblock +; X64-NEXT: .LBB33_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind ret i32 %m @@ -582,7 +620,7 @@ define i32 @length15(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB34_2 +; X64-NEXT: jne .LBB36_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 7(%rdi), %rcx ; X64-NEXT: movq 7(%rsi), %rdx @@ -590,13 +628,13 @@ define i32 @length15(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB34_3 -; X64-NEXT: .LBB34_2: # %res_block +; X64-NEXT: je .LBB36_3 +; X64-NEXT: .LBB36_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB34_3: # %endblock +; X64-NEXT: .LBB36_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind ret i32 %m @@ -610,7 +648,7 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; 
X64-NEXT: jne .LBB35_2 +; X64-NEXT: jne .LBB37_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 7(%rdi), %rcx ; X64-NEXT: movq 7(%rsi), %rdx @@ -618,13 +656,13 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB35_3 -; X64-NEXT: .LBB35_2: # %res_block +; X64-NEXT: je .LBB37_3 +; X64-NEXT: .LBB37_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB35_3: # %endblock +; X64-NEXT: .LBB37_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -640,20 +678,20 @@ define i32 @length15_const(ptr %X, ptr %Y) nounwind { ; X64-NEXT: movq (%rdi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rcx, %rdx -; X64-NEXT: jne .LBB36_2 +; X64-NEXT: jne .LBB38_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movabsq $4051322327650219061, %rcx # imm = 0x3839303132333435 ; X64-NEXT: movq 7(%rdi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rcx, %rdx -; X64-NEXT: je .LBB36_3 -; X64-NEXT: .LBB36_2: # %res_block +; X64-NEXT: je .LBB38_3 +; X64-NEXT: .LBB38_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rcx, %rdx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB36_3: # %endblock +; X64-NEXT: .LBB38_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind ret i32 %m @@ -681,20 +719,20 @@ define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind { ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rax, %rcx -; X64-NEXT: jne .LBB38_2 +; X64-NEXT: jne .LBB40_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movabsq $4051322327650219061, %rax # imm = 0x3839303132333435 ; X64-NEXT: movq 7(%rdi), %rcx ; X64-NEXT: bswapq %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rax, %rcx -; 
X64-NEXT: je .LBB38_3 -; X64-NEXT: .LBB38_2: # %res_block +; X64-NEXT: je .LBB40_3 +; X64-NEXT: .LBB40_2: # %res_block ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rax, %rcx ; X64-NEXT: sbbl %edx, %edx ; X64-NEXT: orl $1, %edx -; X64-NEXT: .LBB38_3: # %endblock +; X64-NEXT: .LBB40_3: # %endblock ; X64-NEXT: testl %edx, %edx ; X64-NEXT: setg %al ; X64-NEXT: retq @@ -713,7 +751,7 @@ define i32 @length16(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB39_2 +; X64-NEXT: jne .LBB41_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx @@ -721,13 +759,13 @@ define i32 @length16(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB39_3 -; X64-NEXT: .LBB39_2: # %res_block +; X64-NEXT: je .LBB41_3 +; X64-NEXT: .LBB41_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB39_3: # %endblock +; X64-NEXT: .LBB41_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind ret i32 %m @@ -783,7 +821,7 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB41_2 +; X64-NEXT: jne .LBB43_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx @@ -791,13 +829,13 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind { ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB41_3 -; X64-NEXT: .LBB41_2: # %res_block +; X64-NEXT: je .LBB43_3 +; X64-NEXT: .LBB43_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB41_3: # %endblock +; X64-NEXT: .LBB43_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al 
killed $al killed $eax ; X64-NEXT: retq @@ -814,7 +852,7 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind { ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: jne .LBB42_2 +; X64-NEXT: jne .LBB44_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rax ; X64-NEXT: movq 8(%rsi), %rcx @@ -822,13 +860,13 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: je .LBB42_3 -; X64-NEXT: .LBB42_2: # %res_block +; X64-NEXT: je .LBB44_3 +; X64-NEXT: .LBB44_2: # %res_block ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: sbbl %edx, %edx ; X64-NEXT: orl $1, %edx -; X64-NEXT: .LBB42_3: # %endblock +; X64-NEXT: .LBB44_3: # %endblock ; X64-NEXT: testl %edx, %edx ; X64-NEXT: setg %al ; X64-NEXT: retq From 6d321530af6e83e51c2ed08463593af07ead9448 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 3 Jan 2025 09:15:27 -0800 Subject: [PATCH 105/480] [CG][RISCV]Add more RVV tests with exact vlen and linear/quadratic number of shuffles --- .../rvv/fixed-vectors-shuffle-exact-vlen.ll | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index f0ee780137300..bb05eb5368ae9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -312,3 +312,91 @@ define <4 x double> @shuffles_add(<4 x double> %0, <4 x double> %1) vscale_range ret <4 x double> %5 } +define <16 x i32> @m4_square_num_of_shuffles_in_chunks(<16 x i32> %0) vscale_range(2,2) { +; CHECK-LABEL: m4_square_num_of_shuffles_in_chunks: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI17_0) +; CHECK-NEXT: vl1r.v v12, (a0) +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: 
vsext.vf2 v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <16 x i32> %0, <16 x i32> poison, <16 x i32> + ret <16 x i32> %1 +} + +define <16 x i32> @m4_linear_num_of_shuffles_in_chunks(<16 x i32> %0) vscale_range(2,2) { +; CHECK-LABEL: m4_linear_num_of_shuffles_in_chunks: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a0, %hi(.LCPI18_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI18_0) +; CHECK-NEXT: vl2re16.v v16, (a0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <16 x i32> %0, <16 x i32> poison, <16 x i32> + ret <16 x i32> %1 +} + +define i64 @multi_chunks_shuffle(<32 x i32> %0) vscale_range(8,8) { +; RV32-LABEL: multi_chunks_shuffle: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 16, e32, m1, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: li a1, 63 +; RV32-NEXT: vwsubu.vx v12, v10, a0 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: lui a0, 61681 +; RV32-NEXT: addi a0, a0, -241 +; RV32-NEXT: vand.vx v12, v12, a1 +; RV32-NEXT: vand.vx v10, v10, a1 +; RV32-NEXT: vsrl.vv v12, v8, v12 +; RV32-NEXT: vsll.vv v8, v8, v10 +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vmerge.vvm v8, v10, v8, v0 +; RV32-NEXT: vrgather.vi v10, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 1 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: srai a1, a0, 31 +; RV32-NEXT: ret +; +; RV64-LABEL: multi_chunks_shuffle: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetivli zero, 16, e64, m2, ta, ma +; RV64-NEXT: vsrl.vx v10, v8, a0 +; RV64-NEXT: vsll.vx v8, v8, a0 +; 
RV64-NEXT: lui a0, 61681 +; RV64-NEXT: addi a0, a0, -241 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vmv.s.x v0, a0 +; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vmerge.vvm v8, v10, v8, v0 +; RV64-NEXT: vrgather.vi v10, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vi v8, v8, 1 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret +entry: + %1 = shufflevector <32 x i32> %0, <32 x i32> zeroinitializer, <32 x i32> + %2 = shufflevector <32 x i32> zeroinitializer, <32 x i32> %1, <32 x i32> + %3 = or <32 x i32> %1, %2 + %4 = extractelement <32 x i32> %3, i64 1 + %conv199 = sext i32 %4 to i64 + ret i64 %conv199 +} From d37aa5135c732b37ae3daab9d9bdcc4c45f7a17d Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 13:09:23 -0500 Subject: [PATCH 106/480] [AMDGPU][True16][MC] true16 for v_not_b16 (#120659) Support true16 format for v_not_b16 in MC --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 75 +++++---- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 65 ++++---- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 21 ++- llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s | 42 +++++ .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s | 154 +++++++++++++----- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 65 ++++---- .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 21 ++- .../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s | 69 ++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 74 +++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 62 +++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 18 +- llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s | 42 +++++ .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s | 154 +++++++++++++----- .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 69 ++++---- .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 65 ++++---- .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 21 ++- .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 63 +++++-- 
.../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 54 ++++-- .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 17 +- .../gfx11_dasm_vop3_dpp16_from_vop1.txt | 54 ++++-- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 18 +- .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt | 57 +++++-- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 50 ++++-- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 14 +- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 57 +++++-- .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 54 ++++-- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 18 +- 28 files changed, 1020 insertions(+), 455 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 92ebd0e10c8fd..30911d45c9e97 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1018,7 +1018,7 @@ defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03b, defm V_SWAP_B16 : VOP1Only_Real_gfx11_gfx12<0x066>; defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12<0x067>; defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x01c, "v_mov_b16">; -defm V_NOT_B16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x069, "v_not_b16">; +defm V_NOT_B16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x069, "v_not_b16">; defm V_CVT_I32_I16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">; defm V_CVT_U32_U16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06b, "v_cvt_u32_u16">; diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index fe08042ae5c84..5ceb8ed0065d3 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -2684,50 +2684,65 @@ v_movrelsd_b32 v255, v255 v_nop // GFX11: v_nop ; encoding: [0x00,0x00,0x00,0x7e] -v_not_b16 v5, v1 -// GFX11: v_not_b16_e32 v5, v1 ; encoding: [0x01,0xd3,0x0a,0x7e] +v_not_b16 v5.l, v1.l +// GFX11: v_not_b16_e32 v5.l, v1.l ; encoding: [0x01,0xd3,0x0a,0x7e] -v_not_b16 v5, v127 -// GFX11: v_not_b16_e32 v5, v127 ; encoding: [0x7f,0xd3,0x0a,0x7e] +v_not_b16 v5.l, 
v127.l +// GFX11: v_not_b16_e32 v5.l, v127.l ; encoding: [0x7f,0xd3,0x0a,0x7e] -v_not_b16 v5, s1 -// GFX11: v_not_b16_e32 v5, s1 ; encoding: [0x01,0xd2,0x0a,0x7e] +v_not_b16 v5.l, s1 +// GFX11: v_not_b16_e32 v5.l, s1 ; encoding: [0x01,0xd2,0x0a,0x7e] -v_not_b16 v5, s105 -// GFX11: v_not_b16_e32 v5, s105 ; encoding: [0x69,0xd2,0x0a,0x7e] +v_not_b16 v5.l, s105 +// GFX11: v_not_b16_e32 v5.l, s105 ; encoding: [0x69,0xd2,0x0a,0x7e] -v_not_b16 v5, vcc_lo -// GFX11: v_not_b16_e32 v5, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e] +v_not_b16 v5.l, vcc_lo +// GFX11: v_not_b16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e] -v_not_b16 v5, vcc_hi -// GFX11: v_not_b16_e32 v5, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e] +v_not_b16 v5.l, vcc_hi +// GFX11: v_not_b16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e] -v_not_b16 v5, ttmp15 -// GFX11: v_not_b16_e32 v5, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e] +v_not_b16 v5.l, ttmp15 +// GFX11: v_not_b16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e] -v_not_b16 v5, m0 -// GFX11: v_not_b16_e32 v5, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e] +v_not_b16 v5.l, m0 +// GFX11: v_not_b16_e32 v5.l, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e] -v_not_b16 v5, exec_lo -// GFX11: v_not_b16_e32 v5, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e] +v_not_b16 v5.l, exec_lo +// GFX11: v_not_b16_e32 v5.l, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e] -v_not_b16 v5, exec_hi -// GFX11: v_not_b16_e32 v5, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e] +v_not_b16 v5.l, exec_hi +// GFX11: v_not_b16_e32 v5.l, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e] -v_not_b16 v5, null -// GFX11: v_not_b16_e32 v5, null ; encoding: [0x7c,0xd2,0x0a,0x7e] +v_not_b16 v5.l, null +// GFX11: v_not_b16_e32 v5.l, null ; encoding: [0x7c,0xd2,0x0a,0x7e] -v_not_b16 v5, -1 -// GFX11: v_not_b16_e32 v5, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e] +v_not_b16 v5.l, -1 +// GFX11: v_not_b16_e32 v5.l, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e] -v_not_b16 v5, 0.5 -// GFX11: v_not_b16_e32 v5, 0.5 ; encoding: [0xf0,0xd2,0x0a,0x7e] +v_not_b16 
v5.l, 0.5 +// GFX11: v_not_b16_e32 v5.l, 0.5 ; encoding: [0xf0,0xd2,0x0a,0x7e] -v_not_b16 v5, src_scc -// GFX11: v_not_b16_e32 v5, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e] +v_not_b16 v5.l, src_scc +// GFX11: v_not_b16_e32 v5.l, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e] -v_not_b16 v127, 0xfe0b -// GFX11: v_not_b16_e32 v127, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_not_b16 v127.l, 0xfe0b +// GFX11: v_not_b16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_not_b16 v5.l, v1.h +// GFX11: v_not_b16_e32 v5.l, v1.h ; encoding: [0x81,0xd3,0x0a,0x7e] + +v_not_b16 v5.l, v127.h +// GFX11: v_not_b16_e32 v5.l, v127.h ; encoding: [0xff,0xd3,0x0a,0x7e] + +v_not_b16 v127.l, 0.5 +// GFX11: v_not_b16_e32 v127.l, 0.5 ; encoding: [0xf0,0xd2,0xfe,0x7e] + +v_not_b16 v5.h, src_scc +// GFX11: v_not_b16_e32 v5.h, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7f] + +v_not_b16 v127.h, 0xfe0b +// GFX11: v_not_b16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_not_b32 v5, v1 // GFX11: v_not_b32_e32 v5, v1 ; encoding: [0x01,0x6f,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index f5cf3fd390c7d..4d1bd99b90252 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ -2144,47 +2144,56 @@ v_movrelsd_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_movrelsd_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_movrelsd_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x05,0x30] -v_not_b16 v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_not_b16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_not_b16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_not_b16 v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_not_b16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_not_b16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_not_b16 v5, v1 row_mirror -// GFX11: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_not_b16 v5.l, v1.l row_mirror +// GFX11: v_not_b16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_not_b16 v5, v1 row_half_mirror -// GFX11: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_not_b16 v5.l, v1.l row_half_mirror +// GFX11: v_not_b16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_not_b16 v5, v1 row_shl:1 -// GFX11: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_not_b16 v5.l, v1.l row_shl:1 +// GFX11: v_not_b16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_not_b16 v5, v1 row_shl:15 -// GFX11: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_not_b16 v5.l, v1.l row_shl:15 +// GFX11: v_not_b16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_not_b16 v5, v1 row_shr:1 -// GFX11: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_not_b16 v5.l, v1.l row_shr:1 +// GFX11: v_not_b16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_not_b16 v5, v1 
row_shr:15 -// GFX11: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_not_b16 v5.l, v1.l row_shr:15 +// GFX11: v_not_b16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_not_b16 v5, v1 row_ror:1 -// GFX11: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_not_b16 v5.l, v1.l row_ror:1 +// GFX11: v_not_b16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_not_b16 v5, v1 row_ror:15 -// GFX11: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_not_b16 v5.l, v1.l row_ror:15 +// GFX11: v_not_b16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_not_b16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_not_b16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_not_b16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_not_b16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_not_b16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_not_b16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_not_b16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_not_b16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_not_b16_dpp v5.l, v1.l 
row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_not_b16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30] +v_not_b16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_not_b16_dpp v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30] + +v_not_b16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_not_b16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +v_not_b16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_not_b16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_not_b16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_not_b16_dpp v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x05,0x30] v_not_b32 v5, v1 quad_perm:[3,2,1,0] // GFX11: v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index 5a0ffd04bc5c1..2799ea7b8ef8b 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -500,14 +500,23 @@ v_movrelsd_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_movrelsd_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_movrelsd_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_not_b16 v5.l, 
v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_not_b16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_not_b16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_not_b16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_not_b16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_not_b16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +v_not_b16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_not_b16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_not_b16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_not_b16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00] v_not_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_not_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index 92882cb89e201..caa73b7b9f047 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -584,6 +584,12 @@ v_log_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] v_not_b16_e32 v128, 0xfe0b // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction +v_not_b16_e32 v128.h, 0xfe0b +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v128.l, 0xfe0b +// 
GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + v_not_b16_e32 v255, v1 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction @@ -593,6 +599,24 @@ v_not_b16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_not_b16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction +v_not_b16_e32 v255.h, v1.h +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.l, v1.l +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + v_not_b16_e32 v5, v199 // GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction @@ -602,6 +626,24 @@ v_not_b16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_not_b16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction +v_not_b16_e32 v5.h, v199.h +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.l, v199.l +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + v_rcp_f16_e32 v128.h, 0xfe0b // GFX11: :[[@LINE-1]]:15: error: 
invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index d97c8ed844dbb..0dd1bf6142189 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -1538,71 +1538,137 @@ v_log_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_log_f16 v5, v199 quad_perm:[3,2,1,0] // GFX11: v_log_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_not_b16 v128, 0xfe0b -// GFX11: v_not_b16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_not_b16 v128.h, 0xfe0b +// GFX11: v_not_b16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_not_b16 v255, -1 -// GFX11: v_not_b16_e64 v255, -1 ; encoding: [0xff,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +v_not_b16 v128.l, 0xfe0b +// GFX11: v_not_b16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_not_b16 v255, 0.5 -// GFX11: v_not_b16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] +v_not_b16 v255.h, -1 +// GFX11: v_not_b16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xc1,0x00,0x00,0x00] -v_not_b16 v255, exec_hi -// GFX11: v_not_b16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +v_not_b16 v255.h, 0.5 +// GFX11: v_not_b16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xf0,0x00,0x00,0x00] -v_not_b16 v255, exec_lo -// GFX11: v_not_b16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +v_not_b16 v255.h, exec_hi +// GFX11: v_not_b16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7f,0x00,0x00,0x00] -v_not_b16 v255, m0 -// GFX11: v_not_b16_e64 v255, m0 ; encoding: [0xff,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +v_not_b16 v255.h, exec_lo +// GFX11: v_not_b16_e64 v255.h, 
exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7e,0x00,0x00,0x00] -v_not_b16 v255, null -// GFX11: v_not_b16_e64 v255, null ; encoding: [0xff,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +v_not_b16 v255.h, m0 +// GFX11: v_not_b16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7d,0x00,0x00,0x00] -v_not_b16 v255, s1 -// GFX11: v_not_b16_e64 v255, s1 ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +v_not_b16 v255.h, null +// GFX11: v_not_b16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7c,0x00,0x00,0x00] -v_not_b16 v255, s105 -// GFX11: v_not_b16_e64 v255, s105 ; encoding: [0xff,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +v_not_b16 v255.h, s1 +// GFX11: v_not_b16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x01,0x00,0x00,0x00] -v_not_b16 v255, src_scc -// GFX11: v_not_b16_e64 v255, src_scc ; encoding: [0xff,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +v_not_b16 v255.h, s105 +// GFX11: v_not_b16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x69,0x00,0x00,0x00] -v_not_b16 v255, ttmp15 -// GFX11: v_not_b16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +v_not_b16 v255.h, src_scc +// GFX11: v_not_b16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xfd,0x00,0x00,0x00] -v_not_b16 v255, v1 -// GFX11: v_not_b16_e64 v255, v1 ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +v_not_b16 v255.h, ttmp15 +// GFX11: v_not_b16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7b,0x00,0x00,0x00] -v_not_b16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_not_b16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_not_b16 v255.h, v1.h +// GFX11: v_not_b16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00] -v_not_b16 v255, v1 quad_perm:[3,2,1,0] -// GFX11: v_not_b16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_not_b16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_not_b16 v255, v127 -// GFX11: v_not_b16_e64 v255, v127 ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x01,0x00,0x00] +v_not_b16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: v_not_b16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_not_b16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_not_b16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_not_b16 v255.h, v127.h +// GFX11: v_not_b16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe9,0xd5,0x7f,0x01,0x00,0x00] -v_not_b16 v255, v127 quad_perm:[3,2,1,0] -// GFX11: v_not_b16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_not_b16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_not_b16 v255, vcc_hi -// GFX11: v_not_b16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +v_not_b16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX11: v_not_b16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_not_b16 v255, vcc_lo -// GFX11: v_not_b16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +v_not_b16 v255.h, vcc_hi +// GFX11: v_not_b16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x6b,0x00,0x00,0x00] -v_not_b16 v5, v199 -// GFX11: v_not_b16_e64 v5, v199 ; encoding: 
[0x05,0x00,0xe9,0xd5,0xc7,0x01,0x00,0x00] +v_not_b16 v255.h, vcc_lo +// GFX11: v_not_b16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x6a,0x00,0x00,0x00] -v_not_b16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_not_b16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_not_b16 v255.l, -1 +// GFX11: v_not_b16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] -v_not_b16 v5, v199 quad_perm:[3,2,1,0] -// GFX11: v_not_b16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_not_b16 v255.l, 0.5 +// GFX11: v_not_b16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] + +v_not_b16 v255.l, exec_hi +// GFX11: v_not_b16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] + +v_not_b16 v255.l, exec_lo +// GFX11: v_not_b16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] + +v_not_b16 v255.l, m0 +// GFX11: v_not_b16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] + +v_not_b16 v255.l, null +// GFX11: v_not_b16_e64 v255.l, null ; encoding: [0xff,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] + +v_not_b16 v255.l, s1 +// GFX11: v_not_b16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] + +v_not_b16 v255.l, s105 +// GFX11: v_not_b16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] + +v_not_b16 v255.l, src_scc +// GFX11: v_not_b16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] + +v_not_b16 v255.l, ttmp15 +// GFX11: v_not_b16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] + +v_not_b16 v255.l, v1.l +// GFX11: v_not_b16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] + +v_not_b16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_not_b16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_not_b16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_not_b16 v255.l, v127.l +// GFX11: v_not_b16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x01,0x00,0x00] + +v_not_b16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_not_b16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX11: v_not_b16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_not_b16 v255.l, vcc_hi +// GFX11: v_not_b16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] + +v_not_b16 v255.l, vcc_lo +// GFX11: v_not_b16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] + +v_not_b16 v5.h, v199.h +// GFX11: v_not_b16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe9,0xd5,0xc7,0x01,0x00,0x00] + +v_not_b16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_not_b16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: v_not_b16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_not_b16 v5.l, v199.l +// GFX11: v_not_b16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xe9,0xd5,0xc7,0x01,0x00,0x00] + +v_not_b16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_not_b16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: v_not_b16_e64_dpp v5.l, v199.l 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_rcp_f16 v128, 0xfe0b // GFX11: v_rcp_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xd4,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index 6176baf11c552..8de72e74c2856 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -2248,47 +2248,56 @@ v_movrelsd_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl: v_movrelsd_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_movrelsd_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xc4,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] -v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_not_b16_e64_dpp v5, v1 row_mirror -// GFX11: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_mirror +// GFX11: v_not_b16_e64_dpp v5.l, v1.l 
row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_half_mirror -// GFX11: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_half_mirror +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_shl:1 -// GFX11: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_shl:1 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_shl:15 -// GFX11: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_shl:15 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_shr:1 -// GFX11: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_shr:1 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_shr:15 -// GFX11: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_shr:15 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_ror:1 -// GFX11: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_ror:1 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_ror:15 -// GFX11: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_ror:15 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] -v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] +v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 
bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] -v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] + +v_not_b16_e64_dpp v5.h, v1.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] + +v_not_b16_e64_dpp v5.l, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] + +v_not_b16_e64_dpp v255.h, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index f3c8c8a69fbe5..182a13831ec6d 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -640,14 +640,23 @@ v_movrelsd_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_movrelsd_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_movrelsd_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xc4,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] -v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: 
v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +v_not_b16_e64_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_not_b16_e64_dpp v5.l, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x08,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_not_b16_e64_dpp v255.h, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0x40,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 9020017c86106..17678e3bd9f08 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -2674,50 +2674,59 @@ v_movrelsd_b32_e64 v255, v255 
v_nop_e64 // GFX11: v_nop ; encoding: [0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00] -v_not_b16_e64 v5, v1 -// GFX11: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +v_not_b16_e64 v5.l, v1.l +// GFX11: v_not_b16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] -v_not_b16_e64 v5, v255 -// GFX11: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] +v_not_b16_e64 v5.l, v255.l +// GFX11: v_not_b16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] -v_not_b16_e64 v5, s1 -// GFX11: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +v_not_b16_e64 v5.l, s1 +// GFX11: v_not_b16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] -v_not_b16_e64 v5, s105 -// GFX11: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +v_not_b16_e64 v5.l, s105 +// GFX11: v_not_b16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] -v_not_b16_e64 v5, vcc_lo -// GFX11: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +v_not_b16_e64 v5.l, vcc_lo +// GFX11: v_not_b16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] -v_not_b16_e64 v5, vcc_hi -// GFX11: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +v_not_b16_e64 v5.l, vcc_hi +// GFX11: v_not_b16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] -v_not_b16_e64 v5, ttmp15 -// GFX11: v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +v_not_b16_e64 v5.l, ttmp15 +// GFX11: v_not_b16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] -v_not_b16_e64 v5, m0 -// GFX11: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +v_not_b16_e64 v5.l, m0 +// GFX11: v_not_b16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] -v_not_b16_e64 v5, exec_lo -// GFX11: v_not_b16_e64 v5, exec_lo ; encoding: 
[0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +v_not_b16_e64 v5.l, exec_lo +// GFX11: v_not_b16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] -v_not_b16_e64 v5, exec_hi -// GFX11: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +v_not_b16_e64 v5.l, exec_hi +// GFX11: v_not_b16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] -v_not_b16_e64 v5, null -// GFX11: v_not_b16_e64 v5, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +v_not_b16_e64 v5.l, null +// GFX11: v_not_b16_e64 v5.l, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] -v_not_b16_e64 v5, -1 -// GFX11: v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +v_not_b16_e64 v5.l, -1 +// GFX11: v_not_b16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] -v_not_b16_e64 v5, 0.5 -// GFX11: v_not_b16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] +v_not_b16_e64 v5.l, 0.5 +// GFX11: v_not_b16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] -v_not_b16_e64 v5, src_scc -// GFX11: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +v_not_b16_e64 v5.l, src_scc +// GFX11: v_not_b16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] -v_not_b16_e64 v255, 0xfe0b -// GFX11: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_not_b16_e64 v255.l, 0xfe0b +// GFX11: v_not_b16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_not_b16_e64 v5.h, v1.h +// GFX11: [0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00] + +v_not_b16_e64 v5.l, v255.h +// GFX11: [0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00] + +v_not_b16_e64 v255.h, 0xfe0b +// GFX11: [0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] v_not_b32_e64 v5, v1 // GFX11: v_not_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00] diff --git 
a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index b125821d1306e..4f82643fd4886 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -2759,51 +2759,63 @@ v_movrelsd_b32 v255, v255 v_nop // GFX12: v_nop ; encoding: [0x00,0x00,0x00,0x7e] -v_not_b16 v5, v1 -// GFX12: v_not_b16_e32 v5, v1 ; encoding: [0x01,0xd3,0x0a,0x7e] +v_not_b16 v5.l, v1.l +// GFX12: v_not_b16_e32 v5.l, v1.l ; encoding: [0x01,0xd3,0x0a,0x7e] -v_not_b16 v5, v127 -// GFX12: v_not_b16_e32 v5, v127 ; encoding: [0x7f,0xd3,0x0a,0x7e] +v_not_b16 v5.l, v127.l +// GFX12: v_not_b16_e32 v5.l, v127.l ; encoding: [0x7f,0xd3,0x0a,0x7e] -v_not_b16 v5, s1 -// GFX12: v_not_b16_e32 v5, s1 ; encoding: [0x01,0xd2,0x0a,0x7e] +v_not_b16 v5.l, s1 +// GFX12: v_not_b16_e32 v5.l, s1 ; encoding: [0x01,0xd2,0x0a,0x7e] -v_not_b16 v5, s105 -// GFX12: v_not_b16_e32 v5, s105 ; encoding: [0x69,0xd2,0x0a,0x7e] +v_not_b16 v5.l, s105 +// GFX12: v_not_b16_e32 v5.l, s105 ; encoding: [0x69,0xd2,0x0a,0x7e] -v_not_b16 v5, vcc_lo -// GFX12: v_not_b16_e32 v5, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e] +v_not_b16 v5.l, vcc_lo +// GFX12: v_not_b16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e] -v_not_b16 v5, vcc_hi -// GFX12: v_not_b16_e32 v5, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e] +v_not_b16 v5.l, vcc_hi +// GFX12: v_not_b16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e] -v_not_b16 v5, ttmp15 -// GFX12: v_not_b16_e32 v5, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e] +v_not_b16 v5.l, ttmp15 +// GFX12: v_not_b16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e] -v_not_b16 v5, m0 -// GFX12: v_not_b16_e32 v5, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e] +v_not_b16 v5.l, m0 +// GFX12: v_not_b16_e32 v5.l, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e] -v_not_b16 v5, exec_lo -// GFX12: v_not_b16_e32 v5, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e] +v_not_b16 v5.l, exec_lo +// GFX12: v_not_b16_e32 v5.l, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e] -v_not_b16 v5, exec_hi -// GFX12: v_not_b16_e32 
v5, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e] +v_not_b16 v5.l, exec_hi +// GFX12: v_not_b16_e32 v5.l, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e] -v_not_b16 v5, null -// GFX12: v_not_b16_e32 v5, null ; encoding: [0x7c,0xd2,0x0a,0x7e] +v_not_b16 v5.l, null +// GFX12: v_not_b16_e32 v5.l, null ; encoding: [0x7c,0xd2,0x0a,0x7e] -v_not_b16 v5, -1 -// GFX12: v_not_b16_e32 v5, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e] +v_not_b16 v5.l, -1 +// GFX12: v_not_b16_e32 v5.l, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e] -v_not_b16 v5, 0.5 -// GFX12-ASM: v_not_b16_e32 v5, 0.5 ; encoding: [0xf0,0xd2,0x0a,0x7e] -// GFX12-DIS: v_not_b16_e32 v5, 0x3800 ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00] +v_not_b16 v5.l, 0.5 +// GFX12-ASM: v_not_b16_e32 v5.l, 0.5 ; encoding: [0xf0,0xd2,0x0a,0x7e] +// GFX12-DIS: v_not_b16_e32 v5.l, 0x3800 ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00] -v_not_b16 v5, src_scc -// GFX12: v_not_b16_e32 v5, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e] +v_not_b16 v5.l, src_scc +// GFX12: v_not_b16_e32 v5.l, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e] -v_not_b16 v127, 0xfe0b -// GFX12: v_not_b16_e32 v127, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_not_b16 v127.l, 0xfe0b +// GFX12: v_not_b16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_not_b16 v5.l, v1.h +// GFX12: v_not_b16_e32 v5.l, v1.h ; encoding: [0x81,0xd3,0x0a,0x7e] + +v_not_b16 v5.l, v127.h +// GFX12: v_not_b16_e32 v5.l, v127.h ; encoding: [0xff,0xd3,0x0a,0x7e] + +v_not_b16 v5.h, src_scc +// GFX12: v_not_b16_e32 v5.h, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7f] + +v_not_b16 v127.h, 0xfe0b +// GFX12: v_not_b16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_not_b32 v5, v1 // GFX12: v_not_b32_e32 v5, v1 ; encoding: [0x01,0x6f,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index a625326c1dae4..2b3a52cf4e804 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ 
b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -2212,47 +2212,53 @@ v_movrelsd_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_movrelsd_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_movrelsd_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x05,0x30] -v_not_b16 v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_not_b16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_not_b16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_not_b16 v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_not_b16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_not_b16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_not_b16 v5, v1 row_mirror -// GFX12: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_not_b16 v5.l, v1.l row_mirror +// GFX12: v_not_b16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_not_b16 v5, v1 row_half_mirror -// GFX12: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_not_b16 v5.l, v1.l row_half_mirror +// GFX12: v_not_b16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_not_b16 v5, v1 row_shl:1 -// GFX12: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_not_b16 v5.l, v1.l row_shl:1 +// GFX12: v_not_b16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_not_b16 v5, v1 row_shl:15 -// GFX12: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_not_b16 v5.l, v1.l row_shl:15 +// GFX12: v_not_b16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_not_b16 v5, v1 row_shr:1 -// GFX12: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_not_b16 v5.l, v1.l row_shr:1 +// GFX12: v_not_b16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_not_b16 v5, v1 row_shr:15 -// GFX12: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_not_b16 v5.l, v1.l row_shr:15 +// GFX12: v_not_b16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_not_b16 v5, v1 row_ror:1 -// GFX12: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_not_b16 v5.l, v1.l row_ror:1 +// GFX12: v_not_b16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_not_b16 v5, v1 row_ror:15 -// GFX12: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_not_b16 v5.l, v1.l row_ror:15 +// GFX12: v_not_b16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_not_b16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_not_b16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_not_b16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_not_b16 v5, v1 
row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_not_b16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_not_b16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_not_b16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_not_b16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_not_b16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_not_b16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30] +v_not_b16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_not_b16_dpp v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30] + +v_not_b16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_not_b16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_not_b16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_not_b16_dpp v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x05,0x30] v_not_b32 v5, v1 quad_perm:[3,2,1,0] // GFX12: v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index 9281d6fb16ce8..977d5b08b80ee 100644 --- 
a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -529,14 +529,20 @@ v_movrelsd_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_movrelsd_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_movrelsd_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_not_b16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_not_b16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_not_b16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_not_b16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_not_b16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_not_b16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_not_b16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_not_b16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_not_b16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00] v_not_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_not_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s index 33a5dded095c7..1b6734a6a652b 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s +++ 
b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s @@ -536,6 +536,12 @@ v_log_f16_e32 v5, v199 quad_perm:[3,2,1,0] v_not_b16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction +v_not_b16_e32 v128.h, 0xfe0b +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v128.l, 0xfe0b +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + v_not_b16_e32 v255, v1 // GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction @@ -545,6 +551,24 @@ v_not_b16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_not_b16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction +v_not_b16_e32 v255.h, v1.h +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.l, v1.l +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_not_b16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + v_not_b16_e32 v5, v199 // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction @@ -554,6 +578,24 @@ v_not_b16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_not_b16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction +v_not_b16_e32 v5.h, v199.h +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.l, v199.l +// GFX12: :[[@LINE-1]]:21: error: 
invalid operand for instruction + +v_not_b16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_not_b16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + v_rcp_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s index 03519d43c49a9..9d36ea0b9f479 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s @@ -1498,71 +1498,137 @@ v_log_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_log_f16 v5, v199 quad_perm:[3,2,1,0] // GFX12: v_log_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_not_b16 v128, 0xfe0b -// GFX12: v_not_b16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_not_b16 v128.h, 0xfe0b +// GFX12: v_not_b16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_not_b16 v255, -1 -// GFX12: v_not_b16_e64 v255, -1 ; encoding: [0xff,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +v_not_b16 v128.l, 0xfe0b +// GFX12: v_not_b16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_not_b16 v255, 0.5 -// GFX12: v_not_b16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] +v_not_b16 v255.h, -1 +// GFX12: v_not_b16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xc1,0x00,0x00,0x00] -v_not_b16 v255, exec_hi -// GFX12: v_not_b16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +v_not_b16 v255.h, 0.5 +// GFX12: v_not_b16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xf0,0x00,0x00,0x00] -v_not_b16 v255, exec_lo -// GFX12: v_not_b16_e64 v255, exec_lo ; encoding: 
[0xff,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +v_not_b16 v255.h, exec_hi +// GFX12: v_not_b16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7f,0x00,0x00,0x00] -v_not_b16 v255, m0 -// GFX12: v_not_b16_e64 v255, m0 ; encoding: [0xff,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +v_not_b16 v255.h, exec_lo +// GFX12: v_not_b16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7e,0x00,0x00,0x00] -v_not_b16 v255, null -// GFX12: v_not_b16_e64 v255, null ; encoding: [0xff,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +v_not_b16 v255.h, m0 +// GFX12: v_not_b16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7d,0x00,0x00,0x00] -v_not_b16 v255, s1 -// GFX12: v_not_b16_e64 v255, s1 ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +v_not_b16 v255.h, null +// GFX12: v_not_b16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7c,0x00,0x00,0x00] -v_not_b16 v255, s105 -// GFX12: v_not_b16_e64 v255, s105 ; encoding: [0xff,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +v_not_b16 v255.h, s1 +// GFX12: v_not_b16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x01,0x00,0x00,0x00] -v_not_b16 v255, src_scc -// GFX12: v_not_b16_e64 v255, src_scc ; encoding: [0xff,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +v_not_b16 v255.h, s105 +// GFX12: v_not_b16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x69,0x00,0x00,0x00] -v_not_b16 v255, ttmp15 -// GFX12: v_not_b16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +v_not_b16 v255.h, src_scc +// GFX12: v_not_b16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xfd,0x00,0x00,0x00] -v_not_b16 v255, v1 -// GFX12: v_not_b16_e64 v255, v1 ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +v_not_b16 v255.h, ttmp15 +// GFX12: v_not_b16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7b,0x00,0x00,0x00] -v_not_b16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_not_b16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_not_b16 v255.h, v1.h +// GFX12: v_not_b16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00] -v_not_b16 v255, v1 quad_perm:[3,2,1,0] -// GFX12: v_not_b16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_not_b16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_not_b16 v255, v127 -// GFX12: v_not_b16_e64 v255, v127 ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x01,0x00,0x00] +v_not_b16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: v_not_b16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_not_b16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_not_b16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_not_b16 v255.h, v127.h +// GFX12: v_not_b16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe9,0xd5,0x7f,0x01,0x00,0x00] -v_not_b16 v255, v127 quad_perm:[3,2,1,0] -// GFX12: v_not_b16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_not_b16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_not_b16 v255, vcc_hi -// GFX12: v_not_b16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +v_not_b16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX12: v_not_b16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_not_b16 v255, 
vcc_lo -// GFX12: v_not_b16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +v_not_b16 v255.h, vcc_hi +// GFX12: v_not_b16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x6b,0x00,0x00,0x00] -v_not_b16 v5, v199 -// GFX12: v_not_b16_e64 v5, v199 ; encoding: [0x05,0x00,0xe9,0xd5,0xc7,0x01,0x00,0x00] +v_not_b16 v255.h, vcc_lo +// GFX12: v_not_b16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x6a,0x00,0x00,0x00] -v_not_b16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_not_b16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_not_b16 v255.l, -1 +// GFX12: v_not_b16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] -v_not_b16 v5, v199 quad_perm:[3,2,1,0] -// GFX12: v_not_b16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_not_b16 v255.l, 0.5 +// GFX12: v_not_b16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] + +v_not_b16 v255.l, exec_hi +// GFX12: v_not_b16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] + +v_not_b16 v255.l, exec_lo +// GFX12: v_not_b16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] + +v_not_b16 v255.l, m0 +// GFX12: v_not_b16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] + +v_not_b16 v255.l, null +// GFX12: v_not_b16_e64 v255.l, null ; encoding: [0xff,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] + +v_not_b16 v255.l, s1 +// GFX12: v_not_b16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] + +v_not_b16 v255.l, s105 +// GFX12: v_not_b16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] + +v_not_b16 v255.l, src_scc +// GFX12: v_not_b16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] + +v_not_b16 v255.l, ttmp15 +// GFX12: v_not_b16_e64 v255.l, ttmp15 ; encoding: 
[0xff,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] + +v_not_b16 v255.l, v1.l +// GFX12: v_not_b16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] + +v_not_b16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_not_b16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_not_b16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_not_b16 v255.l, v127.l +// GFX12: v_not_b16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x01,0x00,0x00] + +v_not_b16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_not_b16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX12: v_not_b16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_not_b16 v255.l, vcc_hi +// GFX12: v_not_b16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] + +v_not_b16 v255.l, vcc_lo +// GFX12: v_not_b16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] + +v_not_b16 v5.h, v199.h +// GFX12: v_not_b16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe9,0xd5,0xc7,0x01,0x00,0x00] + +v_not_b16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_not_b16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: v_not_b16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_not_b16 v5.l, v199.l +// GFX12: v_not_b16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xe9,0xd5,0xc7,0x01,0x00,0x00] 
+ +v_not_b16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_not_b16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: v_not_b16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_rcp_f16 v128, 0xfe0b // GFX12: v_rcp_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xd4,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index e2fe08ddc8b06..71c12a1333ebc 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -2824,50 +2824,59 @@ v_movrelsd_b32_e64 v255, v255 v_nop_e64 // GFX12: v_nop ; encoding: [0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00] -v_not_b16_e64 v5, v1 -// GFX12: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +v_not_b16_e64 v5.l, v1.l +// GFX12: v_not_b16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] -v_not_b16_e64 v5, v255 -// GFX12: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] +v_not_b16_e64 v5.l, v255.l +// GFX12: v_not_b16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] -v_not_b16_e64 v5, s1 -// GFX12: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +v_not_b16_e64 v5.l, s1 +// GFX12: v_not_b16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] -v_not_b16_e64 v5, s105 -// GFX12: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +v_not_b16_e64 v5.l, s105 +// GFX12: v_not_b16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] -v_not_b16_e64 v5, vcc_lo -// GFX12: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +v_not_b16_e64 v5.l, vcc_lo +// GFX12: v_not_b16_e64 
v5.l, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] -v_not_b16_e64 v5, vcc_hi -// GFX12: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +v_not_b16_e64 v5.l, vcc_hi +// GFX12: v_not_b16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] -v_not_b16_e64 v5, ttmp15 -// GFX12: v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +v_not_b16_e64 v5.l, ttmp15 +// GFX12: v_not_b16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] -v_not_b16_e64 v5, m0 -// GFX12: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +v_not_b16_e64 v5.l, m0 +// GFX12: v_not_b16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] -v_not_b16_e64 v5, exec_lo -// GFX12: v_not_b16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +v_not_b16_e64 v5.l, exec_lo +// GFX12: v_not_b16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] -v_not_b16_e64 v5, exec_hi -// GFX12: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +v_not_b16_e64 v5.l, exec_hi +// GFX12: v_not_b16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] -v_not_b16_e64 v5, null -// GFX12: v_not_b16_e64 v5, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +v_not_b16_e64 v5.l, null +// GFX12: v_not_b16_e64 v5.l, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] -v_not_b16_e64 v5, -1 -// GFX12: v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +v_not_b16_e64 v5.l, -1 +// GFX12: v_not_b16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] -v_not_b16_e64 v5, 0.5 -// GFX12: v_not_b16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] +v_not_b16_e64 v5.l, 0.5 +// GFX12: v_not_b16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] -v_not_b16_e64 v5, src_scc -// GFX12: v_not_b16_e64 v5, src_scc ; encoding: 
[0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +v_not_b16_e64 v5.l, src_scc +// GFX12: v_not_b16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] -v_not_b16_e64 v255, 0xfe0b -// GFX12: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_not_b16_e64 v255.l, 0xfe0b +// GFX12: v_not_b16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_not_b16_e64 v5.h, v1.h +// GFX12: v_not_b16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00] + +v_not_b16_e64 v5.l, v255.h +// GFX12: v_not_b16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00] + +v_not_b16_e64 v255.h, 0xfe0b +// GFX12: v_not_b16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] v_not_b32_e64 v5, v1 // GFX12: v_not_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 3fff2749e6e99..42166032124a3 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -2128,47 +2128,56 @@ v_movrels_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_movrels_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_movrels_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xc3,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] -v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_not_b16_e64_dpp v5, v1 row_mirror -// GFX12: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_mirror +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_half_mirror -// GFX12: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_half_mirror +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_shl:1 -// GFX12: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_shl:1 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_shl:15 -// GFX12: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_shl:15 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_shr:1 -// GFX12: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_shr:1 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_shr:15 -// GFX12: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_shr:15 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_ror:1 -// GFX12: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_ror:1 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_ror:15 -// GFX12: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_ror:15 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] -v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] +v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] -v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] + +v_not_b16_e64_dpp v5.h, v1.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] + +v_not_b16_e64_dpp v5.l, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] + +v_not_b16_e64_dpp v255.h, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index e4ae0ad655518..d65d2004fc1e7 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -619,14 +619,23 @@ v_movrels_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_movrels_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_movrels_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xc3,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] -v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +v_not_b16_e64_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_not_b16_e64_dpp v5.l, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_not_b16_e64_dpp v255.h, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt index 8cf2c2b4f2d1e..38c573a19ba00 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt @@ -2638,49 +2638,82 @@ # GFX11: v_nop ; encoding: [0x00,0x00,0x00,0x7e] 0x01,0xd3,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, v1 ; encoding: [0x01,0xd3,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, v1.l ; encoding: [0x01,0xd3,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, v1 ; encoding: [0x01,0xd3,0x0a,0x7e] 0x7f,0xd3,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, v127 ; encoding: [0x7f,0xd3,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, v127.l ; encoding: [0x7f,0xd3,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, v127 ; encoding: [0x7f,0xd3,0x0a,0x7e] 0x01,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, s1 ; encoding: [0x01,0xd2,0x0a,0x7e] +# 
GFX11-REAL16: v_not_b16_e32 v5.l, s1 ; encoding: [0x01,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, s1 ; encoding: [0x01,0xd2,0x0a,0x7e] 0x69,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, s105 ; encoding: [0x69,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, s105 ; encoding: [0x69,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, s105 ; encoding: [0x69,0xd2,0x0a,0x7e] 0x6a,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e] 0x6b,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e] 0x7b,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e] 0x7d,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e] 0x7e,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e] 0x7f,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e] 0x7c,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, null ; encoding: [0x7c,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, null ; encoding: [0x7c,0xd2,0x0a,0x7e] +# 
GFX11-FAKE16: v_not_b16_e32 v5, null ; encoding: [0x7c,0xd2,0x0a,0x7e] 0xc1,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e] 0xf0,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, 0x3800 ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00] +# GFX11-REAL16: v_not_b16_e32 v5.l, 0x3800 ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e32 v5, 0x3800 ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00] 0xfd,0xd2,0x0a,0x7e -# GFX11: v_not_b16_e32 v5, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e] +# GFX11-REAL16: v_not_b16_e32 v5.l, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e] 0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00 -# GFX11: v_not_b16_e32 v127, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_not_b16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e32 v127, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +0x81,0xd3,0x0a,0x7e +# GFX11-REAL16: v_not_b16_e32 v5.l, v1.h ; encoding: [0x81,0xd3,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xd3,0x0a,0x7e] + +0xff,0xd3,0x0a,0x7e +# GFX11-REAL16: v_not_b16_e32 v5.l, v127.h ; encoding: [0xff,0xd3,0x0a,0x7e] +# GFX11-FAKE16: v_not_b16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xd3,0x0a,0x7e] + +0xf0,0xd2,0xfe,0x7e +# GFX11-REAL16: v_not_b16_e32 v127.l, 0x3800 ; encoding: [0xff,0xd2,0xfe,0x7e,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e32 v127, 0x3800 ; encoding: [0xff,0xd2,0xfe,0x7e,0x00,0x38,0x00,0x00] + +0xfd,0xd2,0x0a,0x7f +# GFX11-REAL16: v_not_b16_e32 v5.h, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7f] + 
+0xff,0xd2,0xfe,0x7f,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_not_b16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] 0x01,0x6f,0x0a,0x7e # GFX11: v_not_b32_e32 v5, v1 ; encoding: [0x01,0x6f,0x0a,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index b9a499549d12c..b801e393c635d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -2119,46 +2119,72 @@ # GFX11: v_movrelsd_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_shr:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX11: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX11: v_not_b16_dpp v5, v1 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30 -# GFX11: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30] +# GFX11-REAL16: v_not_b16_dpp v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30] + +0xfa,0xd2,0xfe,0x7e,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_not_b16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_not_b16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_not_b16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_not_b16_dpp v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_lshlrev_b32_e32 v6, v255, v183 ; encoding: [0xff,0x6f,0x0d,0x30] 0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX11: v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt 
b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index 80c739a98f65f..faf3c6f628b95 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -398,10 +398,23 @@ # GFX11: v_movrelsd_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX11: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xd2,0xfe,0x7e,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_not_b16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_not_b16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +0xe9,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_not_b16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187 ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_not_b16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX11: v_not_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index fd84ed734fb31..f689c43b75365 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -2257,46 +2257,72 @@ # GFX11: v_movrelsd_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xc4,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# 
GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] 0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] + +0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX11: v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index 0edbff63d60ed..48824399a0887 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -599,10 +599,24 @@ # GFX11: v_movrelsd_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0x00,0xc4,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0x05,0x08,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0xff,0x40,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: 
v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX11: v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index 0406d78078305..04c9094465b3b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -2658,49 +2658,76 @@ # GFX11: v_nop ; encoding: [0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00 -# GFX11: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00 -# GFX11: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] 
0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, 
null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00 -# GFX11: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] 0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_not_b16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00 +# GFX11-REAL16: v_not_b16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00 +# GFX11-REAL16: v_not_b16_e64 
v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] + +0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_not_b16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00 # GFX11: v_not_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index 22ae18815a522..b93a6252beaeb 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -2239,46 +2239,68 @@ # GFX12: v_movrelsd_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff] 
0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30 -# GFX12: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30] +# GFX12-REAL16: v_not_b16_dpp v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30] + +0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_not_b16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_not_b16_dpp v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_lshlrev_b32_e32 v6, v255, v183 ; encoding: [0xff,0x6f,0x0d,0x30] 0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX12: v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index bfb84c6cdff39..092ba9b88f951 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -411,10 +411,20 @@ # GFX12: v_movrelsd_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX12: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_not_b16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_not_b16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00] + 0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_not_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index e27469230a15f..7fdb9e0ac6977 100644 --- 
a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -2707,49 +2707,76 @@ # GFX12: v_movrelsd_b32_e64 v255, v255 ; encoding: [0xff,0x00,0xc4,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, 
vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: 
v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00 -# GFX12: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] 0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX12: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00 +# GFX12-REAL16: v_not_b16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00 +# GFX12-REAL16: v_not_b16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] + +0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_not_b16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 
0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_not_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index bc957576b19b6..ad491dc02d384 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -2125,46 +2125,72 @@ # GFX12: v_movrels_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xc3,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: 
v_not_b16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# 
GFX12: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] 0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] + +0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 989824315b2d2..21b4d0572bf37 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -587,10 +587,24 @@ # GFX12: v_movrels_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0x00,0xc3,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0x05,0x08,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0xff,0x40,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: 
v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From a4e47586b9c0566761b7fb704011da6ded823398 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Jan 2025 10:23:13 -0800 Subject: [PATCH 107/480] [ExpandMemCmp] Recognize canonical form of (icmp sle/sge X, 0) in getMemCmpOneBlock. (#121540) This code recognizes special cases where the result of memcmp is compared with 0. If the compare is sle/sge, then InstCombine canonicalizes to (icmp slt X, 1) or (icmp sgt X, -1). We should recognize those patterns too. --- llvm/lib/CodeGen/ExpandMemCmp.cpp | 8 +++++++ llvm/test/CodeGen/AArch64/memcmp.ll | 10 ++------ llvm/test/CodeGen/RISCV/memcmp.ll | 36 +++++++---------------------- llvm/test/CodeGen/X86/memcmp.ll | 12 ++-------- 4 files changed, 20 insertions(+), 46 deletions(-) diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp index cc75a01c6477a..74f93e1979532 100644 --- a/llvm/lib/CodeGen/ExpandMemCmp.cpp +++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp @@ -680,6 +680,14 @@ Value *MemCmpExpansion::getMemCmpOneBlock() { m_SpecificInt(CI->getType()->getIntegerBitWidth() - 1)))) { Pred = ICmpInst::ICMP_SLT; NeedsZExt = true; + } else if (match(UI, m_SpecificICmp(ICmpInst::ICMP_SGT, m_Specific(CI), + m_AllOnes()))) { + // Adjust predicate as if it compared with 0. + Pred = ICmpInst::ICMP_SGE; + } else if (match(UI, m_SpecificICmp(ICmpInst::ICMP_SLT, m_Specific(CI), + m_One()))) { + // Adjust predicate as if it compared with 0. 
+ Pred = ICmpInst::ICMP_SLE; } else { // In case of a successful match this call will set `Pred` variable match(UI, m_ICmp(Pred, m_Specific(CI), m_Zero())); diff --git a/llvm/test/CodeGen/AArch64/memcmp.ll b/llvm/test/CodeGen/AArch64/memcmp.ll index 864f38468842a..98ea86b06d6c5 100644 --- a/llvm/test/CodeGen/AArch64/memcmp.ll +++ b/llvm/test/CodeGen/AArch64/memcmp.ll @@ -265,10 +265,7 @@ define i1 @length4_le(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev w8, w8 ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w8, hi -; CHECK-NEXT: csinv w8, w8, wzr, hs -; CHECK-NEXT: cmp w8, #1 -; CHECK-NEXT: cset w0, lt +; CHECK-NEXT: cset w0, ls ; CHECK-NEXT: ret %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind %c = icmp slt i32 %m, 1 @@ -283,10 +280,7 @@ define i1 @length4_ge(ptr %X, ptr %Y) nounwind { ; CHECK-NEXT: rev w8, w8 ; CHECK-NEXT: rev w9, w9 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w8, hi -; CHECK-NEXT: csinv w8, w8, wzr, hs -; CHECK-NEXT: mvn w8, w8 -; CHECK-NEXT: lsr w0, w8, #31 +; CHECK-NEXT: cset w0, hs ; CHECK-NEXT: ret %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind %c = icmp sgt i32 %m, -1 diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll index 5adda28acb427..f0290298e362a 100644 --- a/llvm/test/CodeGen/RISCV/memcmp.ll +++ b/llvm/test/CodeGen/RISCV/memcmp.ll @@ -6664,10 +6664,8 @@ define i1 @memcmp_le_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a1, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: xori a0, a0, 1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_le_zero: @@ -6678,10 +6676,8 @@ define i1 @memcmp_le_zero(ptr %s1, ptr %s2) nounwind { ; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: xori a0, a0, 1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_le_zero: @@ -6690,10 +6686,8 @@ define i1 @memcmp_le_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a1, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a2 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xori a0, a0, 1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_le_zero: @@ -6704,10 +6698,8 @@ define i1 @memcmp_le_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slti a0, a0, 1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xori a0, a0, 1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_le_zero: @@ -6864,10 +6856,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a1, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: 
slti a0, a0, 0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a0, a0, a1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: xori a0, a0, 1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret ; @@ -6879,10 +6868,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: xori a0, a0, 1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret ; @@ -6892,10 +6878,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a1, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a2 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a0, a0, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xori a0, a0, 1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret ; @@ -6907,10 +6890,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slti a0, a0, 0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xori a0, a0, 1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret ; diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index e744d2a06e55f..bb089e5ddda87 100644 --- a/llvm/test/CodeGen/X86/memcmp.ll +++ 
b/llvm/test/CodeGen/X86/memcmp.ll @@ -268,11 +268,7 @@ define i1 @length4_le(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx ; X64-NEXT: cmpl %ecx, %eax -; X64-NEXT: seta %al -; X64-NEXT: sbbb $0, %al -; X64-NEXT: movsbl %al, %eax -; X64-NEXT: testl %eax, %eax -; X64-NEXT: setle %al +; X64-NEXT: setbe %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind %c = icmp slt i32 %m, 1 @@ -287,11 +283,7 @@ define i1 @length4_ge(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx ; X64-NEXT: cmpl %ecx, %eax -; X64-NEXT: seta %al -; X64-NEXT: sbbb $0, %al -; X64-NEXT: movsbl %al, %eax -; X64-NEXT: testl %eax, %eax -; X64-NEXT: setns %al +; X64-NEXT: setae %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind %c = icmp sgt i32 %m, -1 From 39a9073f9eb71ac610cbafe7eed05ca668871b5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 3 Jan 2025 10:35:11 -0800 Subject: [PATCH 108/480] [flang][cuda] Downgrade allocate pinned error to a warning (#121589) To be in accordance with the reference compiler. 
--- flang/lib/Semantics/check-allocate.cpp | 6 ++++-- flang/test/Semantics/cuf07.cuf | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp index 1e5412324916d..223bee6eb6f11 100644 --- a/flang/lib/Semantics/check-allocate.cpp +++ b/flang/lib/Semantics/check-allocate.cpp @@ -616,9 +616,11 @@ bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) { } if (allocateInfo_.gotPinned) { std::optional cudaAttr{GetCUDADataAttr(ultimate_)}; - if (!cudaAttr || *cudaAttr != common::CUDADataAttr::Pinned) { + if ((!cudaAttr || *cudaAttr != common::CUDADataAttr::Pinned) && + context.languageFeatures().ShouldWarn( + common::UsageWarning::CUDAUsage)) { context.Say(name_.source, - "Object in ALLOCATE must have PINNED attribute when PINNED option is specified"_err_en_US); + "Object in ALLOCATE should have PINNED attribute when PINNED option is specified"_warn_en_US); } } if (allocateInfo_.gotStream) { diff --git a/flang/test/Semantics/cuf07.cuf b/flang/test/Semantics/cuf07.cuf index c48abb5adf0d4..56b2164532ae2 100644 --- a/flang/test/Semantics/cuf07.cuf +++ b/flang/test/Semantics/cuf07.cuf @@ -28,7 +28,7 @@ module m integer, allocatable, device :: ia(:) logical :: plog - !ERROR: Object in ALLOCATE must have PINNED attribute when PINNED option is specified + !WARNING: Object in ALLOCATE should have PINNED attribute when PINNED option is specified allocate(ia(100), pinned = plog) end subroutine From 0844f83fea66332943deed7cdf97b686b2c7c37b Mon Sep 17 00:00:00 2001 From: Arseniy Zaostrovnykh Date: Fri, 3 Jan 2025 19:36:24 +0100 Subject: [PATCH 109/480] [clang][analyzer] Stable order for SymbolRef-keyed containers (#121551) Generalize the `SymbolID`s used for `SymbolData` to all `SymExpr`s and use these IDs for comparison `SymbolRef` keys in various containers, such as `ConstraintMap`. 
These IDs are superior to raw pointer values because they are more controllable and are not randomized across executions (unlike [pointers](https://en.wikipedia.org/wiki/Address_space_layout_randomization)). The order of these IDs is stable across runs because SymExprs are allocated in the same order. Stability of the constraint order is important for the stability of the analyzer results. I evaluated this change on a set of 200+ open-source C and C++ projects with a total of ~78 000 symbolic-execution issues passing Z3 refutation. This patch reduced the run-to-run churn (flakiness) in SE issues from 80-90 to 30-40 (out of 78K) in our CSA deployment (in our setting flaky issues are mostly due to Z3 refutation instability). Note, most of the issue churn (flakiness) is caused by the mentioned Z3 refutation. With Z3 refutation disabled, issue churn goes down to ~10 issues out of 83K and this patch has no effect on appearing/disappearing issues between runs. It does, however, seem to reduce the volatility of the execution flow: before we had 40-80 issues with changed execution flow, after - 10-30. Importantly, this change is necessary for the next step in stabilizing analysis results by caching Z3 query outcomes between analysis runs (work in progress). Across our admittedly noisy CI runs, I detected no significant effect on memory footprint or analysis time.
CPP-5919 --- .../Core/PathSensitive/SymExpr.h | 31 ++++-- .../Core/PathSensitive/SymbolManager.h | 100 ++++++++++++++---- .../lib/StaticAnalyzer/Core/SymbolManager.cpp | 25 ++--- clang/test/Analysis/dump_egraph.cpp | 2 +- .../expr-inspection-printState-diseq-info.c | 12 +-- .../expr-inspection-printState-eq-classes.c | 4 +- clang/test/Analysis/ptr-arith.cpp | 4 +- ...symbol-simplification-disequality-info.cpp | 20 ++-- ...-simplification-fixpoint-one-iteration.cpp | 12 +-- ...simplification-fixpoint-two-iterations.cpp | 18 ++-- clang/test/Analysis/unary-sym-expr.c | 6 +- 11 files changed, 149 insertions(+), 85 deletions(-) diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h index 862a30c0e7363..aca14cf813c4b 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h @@ -25,6 +25,8 @@ namespace ento { class MemRegion; +using SymbolID = unsigned; + /// Symbolic value. These values used to capture symbolic execution of /// the program. class SymExpr : public llvm::FoldingSetNode { @@ -39,9 +41,19 @@ class SymExpr : public llvm::FoldingSetNode { private: Kind K; + /// A unique identifier for this symbol. + /// + /// It is useful for SymbolData to easily differentiate multiple symbols, but + /// also for "ephemeral" symbols, such as binary operations, because this id + /// can be used for arranging constraints or equivalence classes instead of + /// unstable pointer values. + /// + /// Note, however, that it can't be used in Profile because SymbolManager + /// needs to compute Profile before allocating SymExpr. 
+ const SymbolID Sym; protected: - SymExpr(Kind k) : K(k) {} + SymExpr(Kind k, SymbolID Sym) : K(k), Sym(Sym) {} static bool isValidTypeForSymbol(QualType T) { // FIXME: Depending on whether we choose to deprecate structural symbols, @@ -56,6 +68,14 @@ class SymExpr : public llvm::FoldingSetNode { Kind getKind() const { return K; } + /// Get a unique identifier for this symbol. + /// The ID is unique across all SymExprs in a SymbolManager. + /// They reflect the allocation order of these SymExprs, + /// and are likely stable across runs. + /// Used as a key in SymbolRef containers and as part of identity + /// for SymbolData, e.g. SymbolConjured with ID = 7 is "conj_$7". + SymbolID getSymbolID() const { return Sym; } + virtual void dump() const; virtual void dumpToStream(raw_ostream &os) const {} @@ -112,19 +132,14 @@ inline raw_ostream &operator<<(raw_ostream &os, using SymbolRef = const SymExpr *; using SymbolRefSmallVectorTy = SmallVector; -using SymbolID = unsigned; /// A symbol representing data which can be stored in a memory location /// (region). class SymbolData : public SymExpr { - const SymbolID Sym; - void anchor() override; protected: - SymbolData(Kind k, SymbolID sym) : SymExpr(k), Sym(sym) { - assert(classof(this)); - } + SymbolData(Kind k, SymbolID sym) : SymExpr(k, sym) { assert(classof(this)); } public: ~SymbolData() override = default; @@ -132,8 +147,6 @@ class SymbolData : public SymExpr { /// Get a string representation of the kind of the region. 
virtual StringRef getKindStr() const = 0; - SymbolID getSymbolID() const { return Sym; } - unsigned computeComplexity() const override { return 1; }; diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h index 73732d532f630..b57f415ec139f 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h @@ -25,6 +25,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/ImmutableSet.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Allocator.h" #include @@ -43,15 +44,16 @@ class StoreManager; class SymbolRegionValue : public SymbolData { const TypedValueRegion *R; -public: + friend class SymExprAllocator; SymbolRegionValue(SymbolID sym, const TypedValueRegion *r) : SymbolData(SymbolRegionValueKind, sym), R(r) { assert(r); assert(isValidTypeForSymbol(r->getValueType())); } +public: LLVM_ATTRIBUTE_RETURNS_NONNULL - const TypedValueRegion* getRegion() const { return R; } + const TypedValueRegion *getRegion() const { return R; } static void Profile(llvm::FoldingSetNodeID& profile, const TypedValueRegion* R) { profile.AddInteger((unsigned) SymbolRegionValueKind); @@ -84,7 +86,7 @@ class SymbolConjured : public SymbolData { const LocationContext *LCtx; const void *SymbolTag; -public: + friend class SymExprAllocator; SymbolConjured(SymbolID sym, const Stmt *s, const LocationContext *lctx, QualType t, unsigned count, const void *symbolTag) : SymbolData(SymbolConjuredKind, sym), S(s), T(t), Count(count), @@ -98,6 +100,7 @@ class SymbolConjured : public SymbolData { assert(isValidTypeForSymbol(t)); } +public: /// It might return null. 
const Stmt *getStmt() const { return S; } unsigned getCount() const { return Count; } @@ -137,7 +140,7 @@ class SymbolDerived : public SymbolData { SymbolRef parentSymbol; const TypedValueRegion *R; -public: + friend class SymExprAllocator; SymbolDerived(SymbolID sym, SymbolRef parent, const TypedValueRegion *r) : SymbolData(SymbolDerivedKind, sym), parentSymbol(parent), R(r) { assert(parent); @@ -145,6 +148,7 @@ class SymbolDerived : public SymbolData { assert(isValidTypeForSymbol(r->getValueType())); } +public: LLVM_ATTRIBUTE_RETURNS_NONNULL SymbolRef getParentSymbol() const { return parentSymbol; } LLVM_ATTRIBUTE_RETURNS_NONNULL @@ -180,12 +184,13 @@ class SymbolDerived : public SymbolData { class SymbolExtent : public SymbolData { const SubRegion *R; -public: + friend class SymExprAllocator; SymbolExtent(SymbolID sym, const SubRegion *r) : SymbolData(SymbolExtentKind, sym), R(r) { assert(r); } +public: LLVM_ATTRIBUTE_RETURNS_NONNULL const SubRegion *getRegion() const { return R; } @@ -222,7 +227,7 @@ class SymbolMetadata : public SymbolData { unsigned Count; const void *Tag; -public: + friend class SymExprAllocator; SymbolMetadata(SymbolID sym, const MemRegion* r, const Stmt *s, QualType t, const LocationContext *LCtx, unsigned count, const void *tag) : SymbolData(SymbolMetadataKind, sym), R(r), S(s), T(t), LCtx(LCtx), @@ -234,6 +239,7 @@ class SymbolMetadata : public SymbolData { assert(tag); } + public: LLVM_ATTRIBUTE_RETURNS_NONNULL const MemRegion *getRegion() const { return R; } @@ -286,15 +292,16 @@ class SymbolCast : public SymExpr { /// The type of the result. 
QualType ToTy; -public: - SymbolCast(const SymExpr *In, QualType From, QualType To) - : SymExpr(SymbolCastKind), Operand(In), FromTy(From), ToTy(To) { + friend class SymExprAllocator; + SymbolCast(SymbolID Sym, const SymExpr *In, QualType From, QualType To) + : SymExpr(SymbolCastKind, Sym), Operand(In), FromTy(From), ToTy(To) { assert(In); assert(isValidTypeForSymbol(From)); // FIXME: GenericTaintChecker creates symbols of void type. // Otherwise, 'To' should also be a valid type. } +public: unsigned computeComplexity() const override { if (Complexity == 0) Complexity = 1 + Operand->computeComplexity(); @@ -332,9 +339,10 @@ class UnarySymExpr : public SymExpr { UnaryOperator::Opcode Op; QualType T; -public: - UnarySymExpr(const SymExpr *In, UnaryOperator::Opcode Op, QualType T) - : SymExpr(UnarySymExprKind), Operand(In), Op(Op), T(T) { + friend class SymExprAllocator; + UnarySymExpr(SymbolID Sym, const SymExpr *In, UnaryOperator::Opcode Op, + QualType T) + : SymExpr(UnarySymExprKind, Sym), Operand(In), Op(Op), T(T) { // Note, some unary operators are modeled as a binary operator. E.g. ++x is // modeled as x + 1. assert((Op == UO_Minus || Op == UO_Not) && "non-supported unary expression"); @@ -345,6 +353,7 @@ class UnarySymExpr : public SymExpr { assert(!Loc::isLocType(T) && "unary symbol should be nonloc"); } +public: unsigned computeComplexity() const override { if (Complexity == 0) Complexity = 1 + Operand->computeComplexity(); @@ -381,8 +390,8 @@ class BinarySymExpr : public SymExpr { QualType T; protected: - BinarySymExpr(Kind k, BinaryOperator::Opcode op, QualType t) - : SymExpr(k), Op(op), T(t) { + BinarySymExpr(SymbolID Sym, Kind k, BinaryOperator::Opcode op, QualType t) + : SymExpr(k, Sym), Op(op), T(t) { assert(classof(this)); // Binary expressions are results of arithmetic. 
Pointer arithmetic is not // handled by binary expressions, but it is instead handled by applying @@ -425,14 +434,15 @@ class BinarySymExprImpl : public BinarySymExpr { LHSTYPE LHS; RHSTYPE RHS; -public: - BinarySymExprImpl(LHSTYPE lhs, BinaryOperator::Opcode op, RHSTYPE rhs, - QualType t) - : BinarySymExpr(ClassKind, op, t), LHS(lhs), RHS(rhs) { + friend class SymExprAllocator; + BinarySymExprImpl(SymbolID Sym, LHSTYPE lhs, BinaryOperator::Opcode op, + RHSTYPE rhs, QualType t) + : BinarySymExpr(Sym, ClassKind, op, t), LHS(lhs), RHS(rhs) { assert(getPointer(lhs)); assert(getPointer(rhs)); } +public: void dumpToStream(raw_ostream &os) const override { dumpToStreamImpl(os, LHS); dumpToStreamImpl(os, getOpcode()); @@ -478,6 +488,21 @@ using IntSymExpr = BinarySymExprImpl; +class SymExprAllocator { + SymbolID NextSymbolID = 0; + llvm::BumpPtrAllocator &Alloc; + +public: + explicit SymExprAllocator(llvm::BumpPtrAllocator &Alloc) : Alloc(Alloc) {} + + template SymT *make(ArgsT &&...Args) { + return new (Alloc) SymT(nextID(), std::forward(Args)...); + } + +private: + SymbolID nextID() { return NextSymbolID++; } +}; + class SymbolManager { using DataSetTy = llvm::FoldingSet; using SymbolDependTy = @@ -489,15 +514,14 @@ class SymbolManager { /// alive as long as the key is live. SymbolDependTy SymbolDependencies; - unsigned SymbolCounter = 0; - llvm::BumpPtrAllocator& BPAlloc; + SymExprAllocator Alloc; BasicValueFactory &BV; ASTContext &Ctx; public: SymbolManager(ASTContext &ctx, BasicValueFactory &bv, - llvm::BumpPtrAllocator& bpalloc) - : SymbolDependencies(16), BPAlloc(bpalloc), BV(bv), Ctx(ctx) {} + llvm::BumpPtrAllocator &bpalloc) + : SymbolDependencies(16), Alloc(bpalloc), BV(bv), Ctx(ctx) {} static bool canSymbolicate(QualType T); @@ -687,4 +711,36 @@ class SymbolVisitor { } // namespace clang +// Override the default definition that would use pointer values of SymbolRefs +// to order them, which is unstable due to ASLR. 
+// Use the SymbolID instead which reflect the order in which the symbols were +// allocated. This is usually stable across runs leading to the stability of +// ConstraintMap and other containers using SymbolRef as keys. +template <> +struct ::llvm::ImutContainerInfo + : public ImutProfileInfo { + using value_type = clang::ento::SymbolRef; + using value_type_ref = clang::ento::SymbolRef; + using key_type = value_type; + using key_type_ref = value_type_ref; + using data_type = bool; + using data_type_ref = bool; + + static key_type_ref KeyOfValue(value_type_ref D) { return D; } + static data_type_ref DataOfValue(value_type_ref) { return true; } + + static bool isEqual(clang::ento::SymbolRef LHS, clang::ento::SymbolRef RHS) { + return LHS->getSymbolID() == RHS->getSymbolID(); + } + + static bool isLess(clang::ento::SymbolRef LHS, clang::ento::SymbolRef RHS) { + return LHS->getSymbolID() < RHS->getSymbolID(); + } + + // This might seem redundant, but it is required because of the way + // ImmutableSet is implemented through AVLTree: + // same as ImmutableMap, but with a non-informative "data". 
+ static bool isDataEqual(data_type_ref, data_type_ref) { return true; } +}; + #endif // LLVM_CLANG_STATICANALYZER_CORE_PATHSENSITIVE_SYMBOLMANAGER_H diff --git a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp index f21e5c3ad7bd7..738b6a175ce6d 100644 --- a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp +++ b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp @@ -170,9 +170,8 @@ SymbolManager::getRegionValueSymbol(const TypedValueRegion* R) { void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = new (BPAlloc) SymbolRegionValue(SymbolCounter, R); + SD = Alloc.make(R); DataSet.InsertNode(SD, InsertPos); - ++SymbolCounter; } return cast(SD); @@ -188,9 +187,8 @@ const SymbolConjured* SymbolManager::conjureSymbol(const Stmt *E, void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = new (BPAlloc) SymbolConjured(SymbolCounter, E, LCtx, T, Count, SymbolTag); + SD = Alloc.make(E, LCtx, T, Count, SymbolTag); DataSet.InsertNode(SD, InsertPos); - ++SymbolCounter; } return cast(SD); @@ -204,9 +202,8 @@ SymbolManager::getDerivedSymbol(SymbolRef parentSymbol, void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = new (BPAlloc) SymbolDerived(SymbolCounter, parentSymbol, R); + SD = Alloc.make(parentSymbol, R); DataSet.InsertNode(SD, InsertPos); - ++SymbolCounter; } return cast(SD); @@ -219,9 +216,8 @@ SymbolManager::getExtentSymbol(const SubRegion *R) { void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = new (BPAlloc) SymbolExtent(SymbolCounter, R); + SD = Alloc.make(R); DataSet.InsertNode(SD, InsertPos); - ++SymbolCounter; } return cast(SD); @@ -236,9 +232,8 @@ SymbolManager::getMetadataSymbol(const MemRegion* R, const Stmt *S, QualType T, void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = new (BPAlloc) 
SymbolMetadata(SymbolCounter, R, S, T, LCtx, Count, SymbolTag); + SD = Alloc.make(R, S, T, LCtx, Count, SymbolTag); DataSet.InsertNode(SD, InsertPos); - ++SymbolCounter; } return cast(SD); @@ -252,7 +247,7 @@ SymbolManager::getCastSymbol(const SymExpr *Op, void *InsertPos; SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = new (BPAlloc) SymbolCast(Op, From, To); + data = Alloc.make(Op, From, To); DataSet.InsertNode(data, InsertPos); } @@ -268,7 +263,7 @@ const SymIntExpr *SymbolManager::getSymIntExpr(const SymExpr *lhs, SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = new (BPAlloc) SymIntExpr(lhs, op, v, t); + data = Alloc.make(lhs, op, v, t); DataSet.InsertNode(data, InsertPos); } @@ -284,7 +279,7 @@ const IntSymExpr *SymbolManager::getIntSymExpr(APSIntPtr lhs, SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = new (BPAlloc) IntSymExpr(lhs, op, rhs, t); + data = Alloc.make(lhs, op, rhs, t); DataSet.InsertNode(data, InsertPos); } @@ -301,7 +296,7 @@ const SymSymExpr *SymbolManager::getSymSymExpr(const SymExpr *lhs, SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = new (BPAlloc) SymSymExpr(lhs, op, rhs, t); + data = Alloc.make(lhs, op, rhs, t); DataSet.InsertNode(data, InsertPos); } @@ -316,7 +311,7 @@ const UnarySymExpr *SymbolManager::getUnarySymExpr(const SymExpr *Operand, void *InsertPos; SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = new (BPAlloc) UnarySymExpr(Operand, Opc, T); + data = Alloc.make(Operand, Opc, T); DataSet.InsertNode(data, InsertPos); } diff --git a/clang/test/Analysis/dump_egraph.cpp b/clang/test/Analysis/dump_egraph.cpp index d1229b2634674..13459699a06f6 100644 --- a/clang/test/Analysis/dump_egraph.cpp +++ b/clang/test/Analysis/dump_egraph.cpp @@ -21,7 +21,7 @@ void foo() { // CHECK: \"location_context\": \"#0 Call\", \"calling\": \"T::T\", \"location\": \{ \"line\": 15, \"column\": 
5, \"file\": \"{{.*}}dump_egraph.cpp\" \}, \"items\": [\l        \{ \"init_id\": {{[0-9]+}}, \"kind\": \"construct into member variable\", \"argument_index\": null, \"pretty\": \"s\", \"value\": \"&t.s\" -// CHECK: \"cluster\": \"t\", \"pointer\": \"{{0x[0-9a-f]+}}\", \"items\": [\l        \{ \"kind\": \"Default\", \"offset\": 0, \"value\": \"conj_$2\{int, LC5, no stmt, #1\}\" +// CHECK: \"cluster\": \"t\", \"pointer\": \"{{0x[0-9a-f]+}}\", \"items\": [\l        \{ \"kind\": \"Default\", \"offset\": 0, \"value\": \"conj_$3\{int, LC5, no stmt, #1\}\" // CHECK: \"dynamic_types\": [\l      \{ \"region\": \"HeapSymRegion\{conj_$1\{S *, LC1, S{{[0-9]+}}, #1\}\}\", \"dyn_type\": \"S\", \"sub_classable\": false \}\l diff --git a/clang/test/Analysis/expr-inspection-printState-diseq-info.c b/clang/test/Analysis/expr-inspection-printState-diseq-info.c index c5c31785a600e..515fcbbd43079 100644 --- a/clang/test/Analysis/expr-inspection-printState-diseq-info.c +++ b/clang/test/Analysis/expr-inspection-printState-diseq-info.c @@ -18,17 +18,17 @@ void test_disequality_info(int e0, int b0, int b1, int c0) { // CHECK-NEXT: { // CHECK-NEXT: "class": [ "(reg_$0) - 2" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$2" ]] + // CHECK-NEXT: [ "reg_$7" ]] // CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$2" ], + // CHECK-NEXT: "class": [ "reg_$15" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "(reg_$0) - 2" ], - // CHECK-NEXT: [ "reg_$3" ]] + // CHECK-NEXT: [ "reg_$7" ]] // CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$3" ], + // CHECK-NEXT: "class": [ "reg_$7" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$2" ]] + // CHECK-NEXT: [ "(reg_$0) - 2" ], + // CHECK-NEXT: [ "reg_$15" ]] // CHECK-NEXT: } // CHECK-NEXT: ], diff --git a/clang/test/Analysis/expr-inspection-printState-eq-classes.c b/clang/test/Analysis/expr-inspection-printState-eq-classes.c index 38e23d6e83826..19cc13735ab5a 100644 --- 
a/clang/test/Analysis/expr-inspection-printState-eq-classes.c +++ b/clang/test/Analysis/expr-inspection-printState-eq-classes.c @@ -16,6 +16,6 @@ void test_equivalence_classes(int a, int b, int c, int d) { } // CHECK: "equivalence_classes": [ -// CHECK-NEXT: [ "(reg_$0) != (reg_$2)" ], -// CHECK-NEXT: [ "reg_$0", "reg_$2", "reg_$3" ] +// CHECK-NEXT: [ "(reg_$0) != (reg_$5)" ], +// CHECK-NEXT: [ "reg_$0", "reg_$20", "reg_$5" ] // CHECK-NEXT: ], diff --git a/clang/test/Analysis/ptr-arith.cpp b/clang/test/Analysis/ptr-arith.cpp index a1264a1f04839..ec1c75c0c4063 100644 --- a/clang/test/Analysis/ptr-arith.cpp +++ b/clang/test/Analysis/ptr-arith.cpp @@ -139,10 +139,10 @@ struct parse_t { int parse(parse_t *p) { unsigned copy = p->bits2; clang_analyzer_dump(copy); - // expected-warning@-1 {{reg_$1},0 S64b,struct Bug_55934::parse_t}.bits2>}} + // expected-warning@-1 {{reg_$2},0 S64b,struct Bug_55934::parse_t}.bits2>}} header *bits = (header *)© clang_analyzer_dump(bits->b); - // expected-warning@-1 {{derived_$2{reg_$1},0 S64b,struct Bug_55934::parse_t}.bits2>,Element{copy,0 S64b,struct Bug_55934::header}.b}}} + // expected-warning@-1 {{derived_$4{reg_$2},0 S64b,struct Bug_55934::parse_t}.bits2>,Element{copy,0 S64b,struct Bug_55934::header}.b}}} return bits->b; // no-warning } } // namespace Bug_55934 diff --git a/clang/test/Analysis/symbol-simplification-disequality-info.cpp b/clang/test/Analysis/symbol-simplification-disequality-info.cpp index 69238b583eb84..33b8f150f5d02 100644 --- a/clang/test/Analysis/symbol-simplification-disequality-info.cpp +++ b/clang/test/Analysis/symbol-simplification-disequality-info.cpp @@ -14,14 +14,14 @@ void test(int a, int b, int c, int d) { clang_analyzer_printState(); // CHECK: "disequality_info": [ // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "((reg_$0) + (reg_$1)) + (reg_$2)" ], + // CHECK-NEXT: "class": [ "((reg_$0) + (reg_$2)) + (reg_$5)" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$3" ]] + // CHECK-NEXT: [ "reg_$8" ]] 
// CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$3" ], + // CHECK-NEXT: "class": [ "reg_$8" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "((reg_$0) + (reg_$1)) + (reg_$2)" ]] + // CHECK-NEXT: [ "((reg_$0) + (reg_$2)) + (reg_$5)" ]] // CHECK-NEXT: } // CHECK-NEXT: ], @@ -32,14 +32,14 @@ void test(int a, int b, int c, int d) { clang_analyzer_printState(); // CHECK: "disequality_info": [ // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "(reg_$0) + (reg_$2)" ], + // CHECK-NEXT: "class": [ "(reg_$0) + (reg_$5)" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$3" ]] + // CHECK-NEXT: [ "reg_$8" ]] // CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$3" ], + // CHECK-NEXT: "class": [ "reg_$8" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "(reg_$0) + (reg_$2)" ]] + // CHECK-NEXT: [ "(reg_$0) + (reg_$5)" ]] // CHECK-NEXT: } // CHECK-NEXT: ], @@ -50,10 +50,10 @@ void test(int a, int b, int c, int d) { // CHECK-NEXT: { // CHECK-NEXT: "class": [ "reg_$0" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$3" ]] + // CHECK-NEXT: [ "reg_$8" ]] // CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$3" ], + // CHECK-NEXT: "class": [ "reg_$8" ], // CHECK-NEXT: "disequal_to": [ // CHECK-NEXT: [ "reg_$0" ]] // CHECK-NEXT: } diff --git a/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp b/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp index 73922d420a8c3..42e984762538e 100644 --- a/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp +++ b/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp @@ -13,10 +13,10 @@ void test(int a, int b, int c) { return; clang_analyzer_printState(); // CHECK: "constraints": [ - // CHECK-NEXT: { "symbol": "((reg_$0) + (reg_$1)) != (reg_$2)", "range": "{ [0, 0] }" } + // CHECK-NEXT: { "symbol": "((reg_$0) + (reg_$2)) != (reg_$5)", "range": "{ [0, 0] }" } // CHECK-NEXT: ], // CHECK-NEXT: 
"equivalence_classes": [ - // CHECK-NEXT: [ "(reg_$0) + (reg_$1)", "reg_$2" ] + // CHECK-NEXT: [ "(reg_$0) + (reg_$2)", "reg_$5" ] // CHECK-NEXT: ], // CHECK-NEXT: "disequality_info": null, @@ -25,12 +25,12 @@ void test(int a, int b, int c) { return; clang_analyzer_printState(); // CHECK: "constraints": [ - // CHECK-NEXT: { "symbol": "(reg_$0) != (reg_$2)", "range": "{ [0, 0] }" }, - // CHECK-NEXT: { "symbol": "reg_$1", "range": "{ [0, 0] }" } + // CHECK-NEXT: { "symbol": "(reg_$0) != (reg_$5)", "range": "{ [0, 0] }" }, + // CHECK-NEXT: { "symbol": "reg_$2", "range": "{ [0, 0] }" } // CHECK-NEXT: ], // CHECK-NEXT: "equivalence_classes": [ - // CHECK-NEXT: [ "(reg_$0) != (reg_$2)" ], - // CHECK-NEXT: [ "reg_$0", "reg_$2" ] + // CHECK-NEXT: [ "(reg_$0) != (reg_$5)" ], + // CHECK-NEXT: [ "reg_$0", "reg_$5" ] // CHECK-NEXT: ], // CHECK-NEXT: "disequality_info": null, diff --git a/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp b/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp index 679ed3fda7a7a..cffb5a70869eb 100644 --- a/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp +++ b/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp @@ -15,11 +15,11 @@ void test(int a, int b, int c, int d) { return; clang_analyzer_printState(); // CHECK: "constraints": [ - // CHECK-NEXT: { "symbol": "(((reg_$0) + (reg_$1)) + (reg_$2)) != (reg_$3)", "range": "{ [0, 0] }" }, - // CHECK-NEXT: { "symbol": "(reg_$2) + (reg_$1)", "range": "{ [0, 0] }" } + // CHECK-NEXT: { "symbol": "(((reg_$0) + (reg_$2)) + (reg_$5)) != (reg_$8)", "range": "{ [0, 0] }" }, + // CHECK-NEXT: { "symbol": "(reg_$5) + (reg_$2)", "range": "{ [0, 0] }" } // CHECK-NEXT: ], // CHECK-NEXT: "equivalence_classes": [ - // CHECK-NEXT: [ "((reg_$0) + (reg_$1)) + (reg_$2)", "reg_$3" ] + // CHECK-NEXT: [ "((reg_$0) + (reg_$2)) + (reg_$5)", "reg_$8" ] // CHECK-NEXT: ], // CHECK-NEXT: "disequality_info": null, @@ -28,14 +28,14 @@ void test(int a, int 
b, int c, int d) { return; clang_analyzer_printState(); // CHECK: "constraints": [ - // CHECK-NEXT: { "symbol": "(reg_$0) != (reg_$3)", "range": "{ [0, 0] }" }, - // CHECK-NEXT: { "symbol": "reg_$1", "range": "{ [0, 0] }" }, - // CHECK-NEXT: { "symbol": "reg_$2", "range": "{ [0, 0] }" } + // CHECK-NEXT: { "symbol": "(reg_$0) != (reg_$8)", "range": "{ [0, 0] }" }, + // CHECK-NEXT: { "symbol": "reg_$2", "range": "{ [0, 0] }" }, + // CHECK-NEXT: { "symbol": "reg_$5", "range": "{ [0, 0] }" } // CHECK-NEXT: ], // CHECK-NEXT: "equivalence_classes": [ - // CHECK-NEXT: [ "(reg_$0) != (reg_$3)" ], - // CHECK-NEXT: [ "reg_$0", "reg_$3" ], - // CHECK-NEXT: [ "reg_$2" ] + // CHECK-NEXT: [ "(reg_$0) != (reg_$8)" ], + // CHECK-NEXT: [ "reg_$0", "reg_$8" ], + // CHECK-NEXT: [ "reg_$5" ] // CHECK-NEXT: ], // CHECK-NEXT: "disequality_info": null, diff --git a/clang/test/Analysis/unary-sym-expr.c b/clang/test/Analysis/unary-sym-expr.c index 92e11b295bee7..64a01a956c442 100644 --- a/clang/test/Analysis/unary-sym-expr.c +++ b/clang/test/Analysis/unary-sym-expr.c @@ -11,9 +11,9 @@ int test(int x, int y) { clang_analyzer_dump(-x); // expected-warning{{-reg_$0}} clang_analyzer_dump(~x); // expected-warning{{~reg_$0}} int z = x + y; - clang_analyzer_dump(-z); // expected-warning{{-((reg_$0) + (reg_$1))}} - clang_analyzer_dump(-(x + y)); // expected-warning{{-((reg_$0) + (reg_$1))}} - clang_analyzer_dump(-x + y); // expected-warning{{(-reg_$0) + (reg_$1)}} + clang_analyzer_dump(-z); // expected-warning{{-((reg_$0) + (reg_$3))}} + clang_analyzer_dump(-(x + y)); // expected-warning{{-((reg_$0) + (reg_$3))}} + clang_analyzer_dump(-x + y); // expected-warning{{(-reg_$0) + (reg_$3)}} if (-x == 0) { clang_analyzer_eval(-x == 0); // expected-warning{{TRUE}} From a106ad0f1d0f74fde3591149c63f3e94ec780fef Mon Sep 17 00:00:00 2001 From: Balazs Benics Date: Fri, 3 Jan 2025 19:43:24 +0100 Subject: [PATCH 110/480] Revert "[clang][analyzer] Stable order for SymbolRef-keyed containers" (#121592) Reverts 
llvm/llvm-project#121551 We had a bunch of build errors caused by this PR. https://lab.llvm.org/buildbot/#/builders/144/builds/14875 --- .../Core/PathSensitive/SymExpr.h | 31 ++---- .../Core/PathSensitive/SymbolManager.h | 100 ++++-------------- .../lib/StaticAnalyzer/Core/SymbolManager.cpp | 25 +++-- clang/test/Analysis/dump_egraph.cpp | 2 +- .../expr-inspection-printState-diseq-info.c | 12 +-- .../expr-inspection-printState-eq-classes.c | 4 +- clang/test/Analysis/ptr-arith.cpp | 4 +- ...symbol-simplification-disequality-info.cpp | 20 ++-- ...-simplification-fixpoint-one-iteration.cpp | 12 +-- ...simplification-fixpoint-two-iterations.cpp | 18 ++-- clang/test/Analysis/unary-sym-expr.c | 6 +- 11 files changed, 85 insertions(+), 149 deletions(-) diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h index aca14cf813c4b..862a30c0e7363 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h @@ -25,8 +25,6 @@ namespace ento { class MemRegion; -using SymbolID = unsigned; - /// Symbolic value. These values used to capture symbolic execution of /// the program. class SymExpr : public llvm::FoldingSetNode { @@ -41,19 +39,9 @@ class SymExpr : public llvm::FoldingSetNode { private: Kind K; - /// A unique identifier for this symbol. - /// - /// It is useful for SymbolData to easily differentiate multiple symbols, but - /// also for "ephemeral" symbols, such as binary operations, because this id - /// can be used for arranging constraints or equivalence classes instead of - /// unstable pointer values. - /// - /// Note, however, that it can't be used in Profile because SymbolManager - /// needs to compute Profile before allocating SymExpr. 
- const SymbolID Sym; protected: - SymExpr(Kind k, SymbolID Sym) : K(k), Sym(Sym) {} + SymExpr(Kind k) : K(k) {} static bool isValidTypeForSymbol(QualType T) { // FIXME: Depending on whether we choose to deprecate structural symbols, @@ -68,14 +56,6 @@ class SymExpr : public llvm::FoldingSetNode { Kind getKind() const { return K; } - /// Get a unique identifier for this symbol. - /// The ID is unique across all SymExprs in a SymbolManager. - /// They reflect the allocation order of these SymExprs, - /// and are likely stable across runs. - /// Used as a key in SymbolRef containers and as part of identity - /// for SymbolData, e.g. SymbolConjured with ID = 7 is "conj_$7". - SymbolID getSymbolID() const { return Sym; } - virtual void dump() const; virtual void dumpToStream(raw_ostream &os) const {} @@ -132,14 +112,19 @@ inline raw_ostream &operator<<(raw_ostream &os, using SymbolRef = const SymExpr *; using SymbolRefSmallVectorTy = SmallVector; +using SymbolID = unsigned; /// A symbol representing data which can be stored in a memory location /// (region). class SymbolData : public SymExpr { + const SymbolID Sym; + void anchor() override; protected: - SymbolData(Kind k, SymbolID sym) : SymExpr(k, sym) { assert(classof(this)); } + SymbolData(Kind k, SymbolID sym) : SymExpr(k), Sym(sym) { + assert(classof(this)); + } public: ~SymbolData() override = default; @@ -147,6 +132,8 @@ class SymbolData : public SymExpr { /// Get a string representation of the kind of the region. 
virtual StringRef getKindStr() const = 0; + SymbolID getSymbolID() const { return Sym; } + unsigned computeComplexity() const override { return 1; }; diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h index b57f415ec139f..73732d532f630 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h @@ -25,7 +25,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/FoldingSet.h" -#include "llvm/ADT/ImmutableSet.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Allocator.h" #include @@ -44,16 +43,15 @@ class StoreManager; class SymbolRegionValue : public SymbolData { const TypedValueRegion *R; - friend class SymExprAllocator; +public: SymbolRegionValue(SymbolID sym, const TypedValueRegion *r) : SymbolData(SymbolRegionValueKind, sym), R(r) { assert(r); assert(isValidTypeForSymbol(r->getValueType())); } -public: LLVM_ATTRIBUTE_RETURNS_NONNULL - const TypedValueRegion *getRegion() const { return R; } + const TypedValueRegion* getRegion() const { return R; } static void Profile(llvm::FoldingSetNodeID& profile, const TypedValueRegion* R) { profile.AddInteger((unsigned) SymbolRegionValueKind); @@ -86,7 +84,7 @@ class SymbolConjured : public SymbolData { const LocationContext *LCtx; const void *SymbolTag; - friend class SymExprAllocator; +public: SymbolConjured(SymbolID sym, const Stmt *s, const LocationContext *lctx, QualType t, unsigned count, const void *symbolTag) : SymbolData(SymbolConjuredKind, sym), S(s), T(t), Count(count), @@ -100,7 +98,6 @@ class SymbolConjured : public SymbolData { assert(isValidTypeForSymbol(t)); } -public: /// It might return null. 
const Stmt *getStmt() const { return S; } unsigned getCount() const { return Count; } @@ -140,7 +137,7 @@ class SymbolDerived : public SymbolData { SymbolRef parentSymbol; const TypedValueRegion *R; - friend class SymExprAllocator; +public: SymbolDerived(SymbolID sym, SymbolRef parent, const TypedValueRegion *r) : SymbolData(SymbolDerivedKind, sym), parentSymbol(parent), R(r) { assert(parent); @@ -148,7 +145,6 @@ class SymbolDerived : public SymbolData { assert(isValidTypeForSymbol(r->getValueType())); } -public: LLVM_ATTRIBUTE_RETURNS_NONNULL SymbolRef getParentSymbol() const { return parentSymbol; } LLVM_ATTRIBUTE_RETURNS_NONNULL @@ -184,13 +180,12 @@ class SymbolDerived : public SymbolData { class SymbolExtent : public SymbolData { const SubRegion *R; - friend class SymExprAllocator; +public: SymbolExtent(SymbolID sym, const SubRegion *r) : SymbolData(SymbolExtentKind, sym), R(r) { assert(r); } -public: LLVM_ATTRIBUTE_RETURNS_NONNULL const SubRegion *getRegion() const { return R; } @@ -227,7 +222,7 @@ class SymbolMetadata : public SymbolData { unsigned Count; const void *Tag; - friend class SymExprAllocator; +public: SymbolMetadata(SymbolID sym, const MemRegion* r, const Stmt *s, QualType t, const LocationContext *LCtx, unsigned count, const void *tag) : SymbolData(SymbolMetadataKind, sym), R(r), S(s), T(t), LCtx(LCtx), @@ -239,7 +234,6 @@ class SymbolMetadata : public SymbolData { assert(tag); } - public: LLVM_ATTRIBUTE_RETURNS_NONNULL const MemRegion *getRegion() const { return R; } @@ -292,16 +286,15 @@ class SymbolCast : public SymExpr { /// The type of the result. 
QualType ToTy; - friend class SymExprAllocator; - SymbolCast(SymbolID Sym, const SymExpr *In, QualType From, QualType To) - : SymExpr(SymbolCastKind, Sym), Operand(In), FromTy(From), ToTy(To) { +public: + SymbolCast(const SymExpr *In, QualType From, QualType To) + : SymExpr(SymbolCastKind), Operand(In), FromTy(From), ToTy(To) { assert(In); assert(isValidTypeForSymbol(From)); // FIXME: GenericTaintChecker creates symbols of void type. // Otherwise, 'To' should also be a valid type. } -public: unsigned computeComplexity() const override { if (Complexity == 0) Complexity = 1 + Operand->computeComplexity(); @@ -339,10 +332,9 @@ class UnarySymExpr : public SymExpr { UnaryOperator::Opcode Op; QualType T; - friend class SymExprAllocator; - UnarySymExpr(SymbolID Sym, const SymExpr *In, UnaryOperator::Opcode Op, - QualType T) - : SymExpr(UnarySymExprKind, Sym), Operand(In), Op(Op), T(T) { +public: + UnarySymExpr(const SymExpr *In, UnaryOperator::Opcode Op, QualType T) + : SymExpr(UnarySymExprKind), Operand(In), Op(Op), T(T) { // Note, some unary operators are modeled as a binary operator. E.g. ++x is // modeled as x + 1. assert((Op == UO_Minus || Op == UO_Not) && "non-supported unary expression"); @@ -353,7 +345,6 @@ class UnarySymExpr : public SymExpr { assert(!Loc::isLocType(T) && "unary symbol should be nonloc"); } -public: unsigned computeComplexity() const override { if (Complexity == 0) Complexity = 1 + Operand->computeComplexity(); @@ -390,8 +381,8 @@ class BinarySymExpr : public SymExpr { QualType T; protected: - BinarySymExpr(SymbolID Sym, Kind k, BinaryOperator::Opcode op, QualType t) - : SymExpr(k, Sym), Op(op), T(t) { + BinarySymExpr(Kind k, BinaryOperator::Opcode op, QualType t) + : SymExpr(k), Op(op), T(t) { assert(classof(this)); // Binary expressions are results of arithmetic. 
Pointer arithmetic is not // handled by binary expressions, but it is instead handled by applying @@ -434,15 +425,14 @@ class BinarySymExprImpl : public BinarySymExpr { LHSTYPE LHS; RHSTYPE RHS; - friend class SymExprAllocator; - BinarySymExprImpl(SymbolID Sym, LHSTYPE lhs, BinaryOperator::Opcode op, - RHSTYPE rhs, QualType t) - : BinarySymExpr(Sym, ClassKind, op, t), LHS(lhs), RHS(rhs) { +public: + BinarySymExprImpl(LHSTYPE lhs, BinaryOperator::Opcode op, RHSTYPE rhs, + QualType t) + : BinarySymExpr(ClassKind, op, t), LHS(lhs), RHS(rhs) { assert(getPointer(lhs)); assert(getPointer(rhs)); } -public: void dumpToStream(raw_ostream &os) const override { dumpToStreamImpl(os, LHS); dumpToStreamImpl(os, getOpcode()); @@ -488,21 +478,6 @@ using IntSymExpr = BinarySymExprImpl; -class SymExprAllocator { - SymbolID NextSymbolID = 0; - llvm::BumpPtrAllocator &Alloc; - -public: - explicit SymExprAllocator(llvm::BumpPtrAllocator &Alloc) : Alloc(Alloc) {} - - template SymT *make(ArgsT &&...Args) { - return new (Alloc) SymT(nextID(), std::forward(Args)...); - } - -private: - SymbolID nextID() { return NextSymbolID++; } -}; - class SymbolManager { using DataSetTy = llvm::FoldingSet; using SymbolDependTy = @@ -514,14 +489,15 @@ class SymbolManager { /// alive as long as the key is live. SymbolDependTy SymbolDependencies; - SymExprAllocator Alloc; + unsigned SymbolCounter = 0; + llvm::BumpPtrAllocator& BPAlloc; BasicValueFactory &BV; ASTContext &Ctx; public: SymbolManager(ASTContext &ctx, BasicValueFactory &bv, - llvm::BumpPtrAllocator &bpalloc) - : SymbolDependencies(16), Alloc(bpalloc), BV(bv), Ctx(ctx) {} + llvm::BumpPtrAllocator& bpalloc) + : SymbolDependencies(16), BPAlloc(bpalloc), BV(bv), Ctx(ctx) {} static bool canSymbolicate(QualType T); @@ -711,36 +687,4 @@ class SymbolVisitor { } // namespace clang -// Override the default definition that would use pointer values of SymbolRefs -// to order them, which is unstable due to ASLR. 
-// Use the SymbolID instead which reflect the order in which the symbols were -// allocated. This is usually stable across runs leading to the stability of -// ConstraintMap and other containers using SymbolRef as keys. -template <> -struct ::llvm::ImutContainerInfo - : public ImutProfileInfo { - using value_type = clang::ento::SymbolRef; - using value_type_ref = clang::ento::SymbolRef; - using key_type = value_type; - using key_type_ref = value_type_ref; - using data_type = bool; - using data_type_ref = bool; - - static key_type_ref KeyOfValue(value_type_ref D) { return D; } - static data_type_ref DataOfValue(value_type_ref) { return true; } - - static bool isEqual(clang::ento::SymbolRef LHS, clang::ento::SymbolRef RHS) { - return LHS->getSymbolID() == RHS->getSymbolID(); - } - - static bool isLess(clang::ento::SymbolRef LHS, clang::ento::SymbolRef RHS) { - return LHS->getSymbolID() < RHS->getSymbolID(); - } - - // This might seem redundant, but it is required because of the way - // ImmutableSet is implemented through AVLTree: - // same as ImmutableMap, but with a non-informative "data". 
- static bool isDataEqual(data_type_ref, data_type_ref) { return true; } -}; - #endif // LLVM_CLANG_STATICANALYZER_CORE_PATHSENSITIVE_SYMBOLMANAGER_H diff --git a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp index 738b6a175ce6d..f21e5c3ad7bd7 100644 --- a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp +++ b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp @@ -170,8 +170,9 @@ SymbolManager::getRegionValueSymbol(const TypedValueRegion* R) { void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = Alloc.make(R); + SD = new (BPAlloc) SymbolRegionValue(SymbolCounter, R); DataSet.InsertNode(SD, InsertPos); + ++SymbolCounter; } return cast(SD); @@ -187,8 +188,9 @@ const SymbolConjured* SymbolManager::conjureSymbol(const Stmt *E, void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = Alloc.make(E, LCtx, T, Count, SymbolTag); + SD = new (BPAlloc) SymbolConjured(SymbolCounter, E, LCtx, T, Count, SymbolTag); DataSet.InsertNode(SD, InsertPos); + ++SymbolCounter; } return cast(SD); @@ -202,8 +204,9 @@ SymbolManager::getDerivedSymbol(SymbolRef parentSymbol, void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = Alloc.make(parentSymbol, R); + SD = new (BPAlloc) SymbolDerived(SymbolCounter, parentSymbol, R); DataSet.InsertNode(SD, InsertPos); + ++SymbolCounter; } return cast(SD); @@ -216,8 +219,9 @@ SymbolManager::getExtentSymbol(const SubRegion *R) { void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = Alloc.make(R); + SD = new (BPAlloc) SymbolExtent(SymbolCounter, R); DataSet.InsertNode(SD, InsertPos); + ++SymbolCounter; } return cast(SD); @@ -232,8 +236,9 @@ SymbolManager::getMetadataSymbol(const MemRegion* R, const Stmt *S, QualType T, void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = Alloc.make(R, S, T, 
LCtx, Count, SymbolTag); + SD = new (BPAlloc) SymbolMetadata(SymbolCounter, R, S, T, LCtx, Count, SymbolTag); DataSet.InsertNode(SD, InsertPos); + ++SymbolCounter; } return cast(SD); @@ -247,7 +252,7 @@ SymbolManager::getCastSymbol(const SymExpr *Op, void *InsertPos; SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = Alloc.make(Op, From, To); + data = new (BPAlloc) SymbolCast(Op, From, To); DataSet.InsertNode(data, InsertPos); } @@ -263,7 +268,7 @@ const SymIntExpr *SymbolManager::getSymIntExpr(const SymExpr *lhs, SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = Alloc.make(lhs, op, v, t); + data = new (BPAlloc) SymIntExpr(lhs, op, v, t); DataSet.InsertNode(data, InsertPos); } @@ -279,7 +284,7 @@ const IntSymExpr *SymbolManager::getIntSymExpr(APSIntPtr lhs, SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = Alloc.make(lhs, op, rhs, t); + data = new (BPAlloc) IntSymExpr(lhs, op, rhs, t); DataSet.InsertNode(data, InsertPos); } @@ -296,7 +301,7 @@ const SymSymExpr *SymbolManager::getSymSymExpr(const SymExpr *lhs, SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = Alloc.make(lhs, op, rhs, t); + data = new (BPAlloc) SymSymExpr(lhs, op, rhs, t); DataSet.InsertNode(data, InsertPos); } @@ -311,7 +316,7 @@ const UnarySymExpr *SymbolManager::getUnarySymExpr(const SymExpr *Operand, void *InsertPos; SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = Alloc.make(Operand, Opc, T); + data = new (BPAlloc) UnarySymExpr(Operand, Opc, T); DataSet.InsertNode(data, InsertPos); } diff --git a/clang/test/Analysis/dump_egraph.cpp b/clang/test/Analysis/dump_egraph.cpp index 13459699a06f6..d1229b2634674 100644 --- a/clang/test/Analysis/dump_egraph.cpp +++ b/clang/test/Analysis/dump_egraph.cpp @@ -21,7 +21,7 @@ void foo() { // CHECK: \"location_context\": \"#0 Call\", \"calling\": \"T::T\", \"location\": \{ \"line\": 15, \"column\": 5, 
\"file\": \"{{.*}}dump_egraph.cpp\" \}, \"items\": [\l        \{ \"init_id\": {{[0-9]+}}, \"kind\": \"construct into member variable\", \"argument_index\": null, \"pretty\": \"s\", \"value\": \"&t.s\" -// CHECK: \"cluster\": \"t\", \"pointer\": \"{{0x[0-9a-f]+}}\", \"items\": [\l        \{ \"kind\": \"Default\", \"offset\": 0, \"value\": \"conj_$3\{int, LC5, no stmt, #1\}\" +// CHECK: \"cluster\": \"t\", \"pointer\": \"{{0x[0-9a-f]+}}\", \"items\": [\l        \{ \"kind\": \"Default\", \"offset\": 0, \"value\": \"conj_$2\{int, LC5, no stmt, #1\}\" // CHECK: \"dynamic_types\": [\l      \{ \"region\": \"HeapSymRegion\{conj_$1\{S *, LC1, S{{[0-9]+}}, #1\}\}\", \"dyn_type\": \"S\", \"sub_classable\": false \}\l diff --git a/clang/test/Analysis/expr-inspection-printState-diseq-info.c b/clang/test/Analysis/expr-inspection-printState-diseq-info.c index 515fcbbd43079..c5c31785a600e 100644 --- a/clang/test/Analysis/expr-inspection-printState-diseq-info.c +++ b/clang/test/Analysis/expr-inspection-printState-diseq-info.c @@ -18,17 +18,17 @@ void test_disequality_info(int e0, int b0, int b1, int c0) { // CHECK-NEXT: { // CHECK-NEXT: "class": [ "(reg_$0) - 2" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$7" ]] + // CHECK-NEXT: [ "reg_$2" ]] // CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$15" ], + // CHECK-NEXT: "class": [ "reg_$2" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$7" ]] + // CHECK-NEXT: [ "(reg_$0) - 2" ], + // CHECK-NEXT: [ "reg_$3" ]] // CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$7" ], + // CHECK-NEXT: "class": [ "reg_$3" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "(reg_$0) - 2" ], - // CHECK-NEXT: [ "reg_$15" ]] + // CHECK-NEXT: [ "reg_$2" ]] // CHECK-NEXT: } // CHECK-NEXT: ], diff --git a/clang/test/Analysis/expr-inspection-printState-eq-classes.c b/clang/test/Analysis/expr-inspection-printState-eq-classes.c index 19cc13735ab5a..38e23d6e83826 100644 --- 
a/clang/test/Analysis/expr-inspection-printState-eq-classes.c +++ b/clang/test/Analysis/expr-inspection-printState-eq-classes.c @@ -16,6 +16,6 @@ void test_equivalence_classes(int a, int b, int c, int d) { } // CHECK: "equivalence_classes": [ -// CHECK-NEXT: [ "(reg_$0) != (reg_$5)" ], -// CHECK-NEXT: [ "reg_$0", "reg_$20", "reg_$5" ] +// CHECK-NEXT: [ "(reg_$0) != (reg_$2)" ], +// CHECK-NEXT: [ "reg_$0", "reg_$2", "reg_$3" ] // CHECK-NEXT: ], diff --git a/clang/test/Analysis/ptr-arith.cpp b/clang/test/Analysis/ptr-arith.cpp index ec1c75c0c4063..a1264a1f04839 100644 --- a/clang/test/Analysis/ptr-arith.cpp +++ b/clang/test/Analysis/ptr-arith.cpp @@ -139,10 +139,10 @@ struct parse_t { int parse(parse_t *p) { unsigned copy = p->bits2; clang_analyzer_dump(copy); - // expected-warning@-1 {{reg_$2},0 S64b,struct Bug_55934::parse_t}.bits2>}} + // expected-warning@-1 {{reg_$1},0 S64b,struct Bug_55934::parse_t}.bits2>}} header *bits = (header *)© clang_analyzer_dump(bits->b); - // expected-warning@-1 {{derived_$4{reg_$2},0 S64b,struct Bug_55934::parse_t}.bits2>,Element{copy,0 S64b,struct Bug_55934::header}.b}}} + // expected-warning@-1 {{derived_$2{reg_$1},0 S64b,struct Bug_55934::parse_t}.bits2>,Element{copy,0 S64b,struct Bug_55934::header}.b}}} return bits->b; // no-warning } } // namespace Bug_55934 diff --git a/clang/test/Analysis/symbol-simplification-disequality-info.cpp b/clang/test/Analysis/symbol-simplification-disequality-info.cpp index 33b8f150f5d02..69238b583eb84 100644 --- a/clang/test/Analysis/symbol-simplification-disequality-info.cpp +++ b/clang/test/Analysis/symbol-simplification-disequality-info.cpp @@ -14,14 +14,14 @@ void test(int a, int b, int c, int d) { clang_analyzer_printState(); // CHECK: "disequality_info": [ // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "((reg_$0) + (reg_$2)) + (reg_$5)" ], + // CHECK-NEXT: "class": [ "((reg_$0) + (reg_$1)) + (reg_$2)" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$8" ]] + // CHECK-NEXT: [ "reg_$3" ]] 
// CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$8" ], + // CHECK-NEXT: "class": [ "reg_$3" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "((reg_$0) + (reg_$2)) + (reg_$5)" ]] + // CHECK-NEXT: [ "((reg_$0) + (reg_$1)) + (reg_$2)" ]] // CHECK-NEXT: } // CHECK-NEXT: ], @@ -32,14 +32,14 @@ void test(int a, int b, int c, int d) { clang_analyzer_printState(); // CHECK: "disequality_info": [ // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "(reg_$0) + (reg_$5)" ], + // CHECK-NEXT: "class": [ "(reg_$0) + (reg_$2)" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$8" ]] + // CHECK-NEXT: [ "reg_$3" ]] // CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$8" ], + // CHECK-NEXT: "class": [ "reg_$3" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "(reg_$0) + (reg_$5)" ]] + // CHECK-NEXT: [ "(reg_$0) + (reg_$2)" ]] // CHECK-NEXT: } // CHECK-NEXT: ], @@ -50,10 +50,10 @@ void test(int a, int b, int c, int d) { // CHECK-NEXT: { // CHECK-NEXT: "class": [ "reg_$0" ], // CHECK-NEXT: "disequal_to": [ - // CHECK-NEXT: [ "reg_$8" ]] + // CHECK-NEXT: [ "reg_$3" ]] // CHECK-NEXT: }, // CHECK-NEXT: { - // CHECK-NEXT: "class": [ "reg_$8" ], + // CHECK-NEXT: "class": [ "reg_$3" ], // CHECK-NEXT: "disequal_to": [ // CHECK-NEXT: [ "reg_$0" ]] // CHECK-NEXT: } diff --git a/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp b/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp index 42e984762538e..73922d420a8c3 100644 --- a/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp +++ b/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp @@ -13,10 +13,10 @@ void test(int a, int b, int c) { return; clang_analyzer_printState(); // CHECK: "constraints": [ - // CHECK-NEXT: { "symbol": "((reg_$0) + (reg_$2)) != (reg_$5)", "range": "{ [0, 0] }" } + // CHECK-NEXT: { "symbol": "((reg_$0) + (reg_$1)) != (reg_$2)", "range": "{ [0, 0] }" } // CHECK-NEXT: ], // CHECK-NEXT: 
"equivalence_classes": [ - // CHECK-NEXT: [ "(reg_$0) + (reg_$2)", "reg_$5" ] + // CHECK-NEXT: [ "(reg_$0) + (reg_$1)", "reg_$2" ] // CHECK-NEXT: ], // CHECK-NEXT: "disequality_info": null, @@ -25,12 +25,12 @@ void test(int a, int b, int c) { return; clang_analyzer_printState(); // CHECK: "constraints": [ - // CHECK-NEXT: { "symbol": "(reg_$0) != (reg_$5)", "range": "{ [0, 0] }" }, - // CHECK-NEXT: { "symbol": "reg_$2", "range": "{ [0, 0] }" } + // CHECK-NEXT: { "symbol": "(reg_$0) != (reg_$2)", "range": "{ [0, 0] }" }, + // CHECK-NEXT: { "symbol": "reg_$1", "range": "{ [0, 0] }" } // CHECK-NEXT: ], // CHECK-NEXT: "equivalence_classes": [ - // CHECK-NEXT: [ "(reg_$0) != (reg_$5)" ], - // CHECK-NEXT: [ "reg_$0", "reg_$5" ] + // CHECK-NEXT: [ "(reg_$0) != (reg_$2)" ], + // CHECK-NEXT: [ "reg_$0", "reg_$2" ] // CHECK-NEXT: ], // CHECK-NEXT: "disequality_info": null, diff --git a/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp b/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp index cffb5a70869eb..679ed3fda7a7a 100644 --- a/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp +++ b/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp @@ -15,11 +15,11 @@ void test(int a, int b, int c, int d) { return; clang_analyzer_printState(); // CHECK: "constraints": [ - // CHECK-NEXT: { "symbol": "(((reg_$0) + (reg_$2)) + (reg_$5)) != (reg_$8)", "range": "{ [0, 0] }" }, - // CHECK-NEXT: { "symbol": "(reg_$5) + (reg_$2)", "range": "{ [0, 0] }" } + // CHECK-NEXT: { "symbol": "(((reg_$0) + (reg_$1)) + (reg_$2)) != (reg_$3)", "range": "{ [0, 0] }" }, + // CHECK-NEXT: { "symbol": "(reg_$2) + (reg_$1)", "range": "{ [0, 0] }" } // CHECK-NEXT: ], // CHECK-NEXT: "equivalence_classes": [ - // CHECK-NEXT: [ "((reg_$0) + (reg_$2)) + (reg_$5)", "reg_$8" ] + // CHECK-NEXT: [ "((reg_$0) + (reg_$1)) + (reg_$2)", "reg_$3" ] // CHECK-NEXT: ], // CHECK-NEXT: "disequality_info": null, @@ -28,14 +28,14 @@ void test(int a, int 
b, int c, int d) { return; clang_analyzer_printState(); // CHECK: "constraints": [ - // CHECK-NEXT: { "symbol": "(reg_$0) != (reg_$8)", "range": "{ [0, 0] }" }, - // CHECK-NEXT: { "symbol": "reg_$2", "range": "{ [0, 0] }" }, - // CHECK-NEXT: { "symbol": "reg_$5", "range": "{ [0, 0] }" } + // CHECK-NEXT: { "symbol": "(reg_$0) != (reg_$3)", "range": "{ [0, 0] }" }, + // CHECK-NEXT: { "symbol": "reg_$1", "range": "{ [0, 0] }" }, + // CHECK-NEXT: { "symbol": "reg_$2", "range": "{ [0, 0] }" } // CHECK-NEXT: ], // CHECK-NEXT: "equivalence_classes": [ - // CHECK-NEXT: [ "(reg_$0) != (reg_$8)" ], - // CHECK-NEXT: [ "reg_$0", "reg_$8" ], - // CHECK-NEXT: [ "reg_$5" ] + // CHECK-NEXT: [ "(reg_$0) != (reg_$3)" ], + // CHECK-NEXT: [ "reg_$0", "reg_$3" ], + // CHECK-NEXT: [ "reg_$2" ] // CHECK-NEXT: ], // CHECK-NEXT: "disequality_info": null, diff --git a/clang/test/Analysis/unary-sym-expr.c b/clang/test/Analysis/unary-sym-expr.c index 64a01a956c442..92e11b295bee7 100644 --- a/clang/test/Analysis/unary-sym-expr.c +++ b/clang/test/Analysis/unary-sym-expr.c @@ -11,9 +11,9 @@ int test(int x, int y) { clang_analyzer_dump(-x); // expected-warning{{-reg_$0}} clang_analyzer_dump(~x); // expected-warning{{~reg_$0}} int z = x + y; - clang_analyzer_dump(-z); // expected-warning{{-((reg_$0) + (reg_$3))}} - clang_analyzer_dump(-(x + y)); // expected-warning{{-((reg_$0) + (reg_$3))}} - clang_analyzer_dump(-x + y); // expected-warning{{(-reg_$0) + (reg_$3)}} + clang_analyzer_dump(-z); // expected-warning{{-((reg_$0) + (reg_$1))}} + clang_analyzer_dump(-(x + y)); // expected-warning{{-((reg_$0) + (reg_$1))}} + clang_analyzer_dump(-x + y); // expected-warning{{(-reg_$0) + (reg_$1)}} if (-x == 0) { clang_analyzer_eval(-x == 0); // expected-warning{{TRUE}} From e32afded9227635108fad003e5c6d3bd88e5e3c1 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Jan 2025 10:46:37 -0800 Subject: [PATCH 111/480] [LegalizeVectorOps] Use getBoolConstant instead of getAllOnesConstant in 
VectorLegalizer::UnrollVSETCC. (#121526) This code should follow the target preference for boolean contents of a vector type. We shouldn't assume that true is negative one. --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index db21e70897064..39903bde25a62 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -2246,11 +2246,13 @@ SDValue VectorLegalizer::UnrollVSETCC(SDNode *Node) { DAG.getVectorIdxConstant(i, dl)); SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS, DAG.getVectorIdxConstant(i, dl)); + // FIXME: We should use i1 setcc + boolext here, but it causes regressions. Ops[i] = DAG.getNode(ISD::SETCC, dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TmpEltVT), LHSElem, RHSElem, CC); - Ops[i] = DAG.getSelect(dl, EltVT, Ops[i], DAG.getAllOnesConstant(dl, EltVT), + Ops[i] = DAG.getSelect(dl, EltVT, Ops[i], + DAG.getBoolConstant(true, dl, EltVT, VT), DAG.getConstant(0, dl, EltVT)); } return DAG.getBuildVector(VT, dl, Ops); From 34d2c3b9349b151bd69defa4880ecf56fb017287 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 14:11:25 -0500 Subject: [PATCH 112/480] [AMDGPU][True16][MC] true16 for v_sin_f16 (#120692) Support true16 format for v_sin_f16 in MC --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll | 32 ++++ llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 75 +++++---- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 65 ++++---- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 21 ++- llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s | 42 +++++ .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s | 154 +++++++++++++----- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 65 ++++---- .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 25 ++- 
.../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s | 69 ++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 72 ++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 62 +++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 18 +- llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s | 42 +++++ .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s | 154 +++++++++++++----- .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 69 ++++---- .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 65 ++++---- .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 25 ++- .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 63 +++++-- .../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 54 ++++-- .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 17 +- .../gfx11_dasm_vop3_dpp16_from_vop1.txt | 54 ++++-- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 24 ++- .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt | 57 +++++-- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 50 ++++-- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 13 +- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 57 +++++-- .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 54 ++++-- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 24 ++- 29 files changed, 1062 insertions(+), 462 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 30911d45c9e97..badca264e8f92 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1045,7 +1045,7 @@ defm V_CEIL_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16 defm V_TRUNC_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05d, "v_trunc_f16">; defm V_RNDNE_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">; defm V_FRACT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05f, "v_fract_f16">; -defm V_SIN_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x060, "v_sin_f16">; +defm V_SIN_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x060, "v_sin_f16">; defm V_COS_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">; defm V_SAT_PK_U8_I16 : 
VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">; defm V_CVT_NORM_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x063, "v_cvt_norm_i16_f16">; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll index 2bb89fdabda7e..6927636ad04aa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -4,6 +4,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: sin_f16: @@ -80,6 +81,19 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX11-NEXT: v_sin_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sin_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_sin_f16_e32 v1, v1 +; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %a.val = load half, ptr addrspace(1) %a %r.val = call half @llvm.sin.f16(half %a.val) store half %r.val, ptr addrspace(1) %r @@ -188,6 +202,24 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) 
%r, ptr addrspace(1) %a) { ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sin_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 +; GFX12-NEXT: v_sin_f16_e32 v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-NEXT: v_sin_f16_e32 v2, v2 +; GFX12-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %a.val = load <2 x half>, ptr addrspace(1) %a %r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val) store <2 x half> %r.val, ptr addrspace(1) %r diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index 5ceb8ed0065d3..9b9837b46b26d 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -3305,50 +3305,65 @@ v_sat_pk_u8_i16 v5.h, src_scc v_sat_pk_u8_i16 v127.h, 0xfe0b // GFX11: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] -v_sin_f16 v5, v1 -// GFX11: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e] +v_sin_f16 v5.l, v1.l +// GFX11: v_sin_f16_e32 v5.l, v1.l ; encoding: [0x01,0xc1,0x0a,0x7e] -v_sin_f16 v5, v127 -// GFX11: v_sin_f16_e32 v5, v127 ; encoding: [0x7f,0xc1,0x0a,0x7e] +v_sin_f16 v5.l, v127.l +// GFX11: v_sin_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc1,0x0a,0x7e] -v_sin_f16 v5, s1 -// GFX11: v_sin_f16_e32 v5, s1 ; encoding: [0x01,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, s1 +// GFX11: v_sin_f16_e32 v5.l, s1 ; encoding: [0x01,0xc0,0x0a,0x7e] -v_sin_f16 v5, s105 -// GFX11: 
v_sin_f16_e32 v5, s105 ; encoding: [0x69,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, s105 +// GFX11: v_sin_f16_e32 v5.l, s105 ; encoding: [0x69,0xc0,0x0a,0x7e] -v_sin_f16 v5, vcc_lo -// GFX11: v_sin_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, vcc_lo +// GFX11: v_sin_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e] -v_sin_f16 v5, vcc_hi -// GFX11: v_sin_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, vcc_hi +// GFX11: v_sin_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e] -v_sin_f16 v5, ttmp15 -// GFX11: v_sin_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, ttmp15 +// GFX11: v_sin_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e] -v_sin_f16 v5, m0 -// GFX11: v_sin_f16_e32 v5, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, m0 +// GFX11: v_sin_f16_e32 v5.l, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e] -v_sin_f16 v5, exec_lo -// GFX11: v_sin_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, exec_lo +// GFX11: v_sin_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e] -v_sin_f16 v5, exec_hi -// GFX11: v_sin_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, exec_hi +// GFX11: v_sin_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e] -v_sin_f16 v5, null -// GFX11: v_sin_f16_e32 v5, null ; encoding: [0x7c,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, null +// GFX11: v_sin_f16_e32 v5.l, null ; encoding: [0x7c,0xc0,0x0a,0x7e] -v_sin_f16 v5, -1 -// GFX11: v_sin_f16_e32 v5, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, -1 +// GFX11: v_sin_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e] -v_sin_f16 v5, 0.5 -// GFX11: v_sin_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, 0.5 +// GFX11: v_sin_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e] -v_sin_f16 v5, src_scc -// GFX11: v_sin_f16_e32 v5, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, src_scc +// GFX11: v_sin_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e] -v_sin_f16 
v127, 0xfe0b -// GFX11: v_sin_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_sin_f16 v127.l, 0xfe0b +// GFX11: v_sin_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_sin_f16 v5.l, v1.h +// GFX11: v_sin_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc1,0x0a,0x7e] + +v_sin_f16 v5.l, v127.h +// GFX11: v_sin_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc1,0x0a,0x7e] + +v_sin_f16 v127.l, 0.5 +// GFX11: v_sin_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc0,0xfe,0x7e] + +v_sin_f16 v5.h, src_scc +// GFX11: v_sin_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7f] + +v_sin_f16 v127.h, 0xfe0b +// GFX11: v_sin_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_sin_f32 v5, v1 // GFX11: v_sin_f32_e32 v5, v1 ; encoding: [0x01,0x6b,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index 4d1bd99b90252..b080bd9fca461 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ -2582,47 +2582,56 @@ v_sat_pk_u8_i16 v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi: v_sat_pk_u8_i16 v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x05,0x30] -v_sin_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_sin_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_sin_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_sin_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_sin_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_sin_f16_dpp v5.l, v1.l 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_sin_f16 v5, v1 row_mirror -// GFX11: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_sin_f16 v5.l, v1.l row_mirror +// GFX11: v_sin_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_sin_f16 v5, v1 row_half_mirror -// GFX11: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_sin_f16 v5.l, v1.l row_half_mirror +// GFX11: v_sin_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_sin_f16 v5, v1 row_shl:1 -// GFX11: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_sin_f16 v5.l, v1.l row_shl:1 +// GFX11: v_sin_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_sin_f16 v5, v1 row_shl:15 -// GFX11: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_sin_f16 v5.l, v1.l row_shl:15 +// GFX11: v_sin_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_sin_f16 v5, v1 row_shr:1 -// GFX11: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_sin_f16 v5.l, v1.l row_shr:1 +// GFX11: v_sin_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_sin_f16 v5, v1 row_shr:15 -// GFX11: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_sin_f16 v5.l, v1.l row_shr:15 +// GFX11: v_sin_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] 
-v_sin_f16 v5, v1 row_ror:1 -// GFX11: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_sin_f16 v5.l, v1.l row_ror:1 +// GFX11: v_sin_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_sin_f16 v5, v1 row_ror:15 -// GFX11: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_sin_f16 v5.l, v1.l row_ror:15 +// GFX11: v_sin_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_sin_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_sin_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_sin_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_sin_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_sin_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_sin_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_sin_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_sin_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_sin_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_sin_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_sin_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_sin_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_sin_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_sin_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +v_sin_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_sin_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_sin_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_sin_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_sin_f32 v5, v1 quad_perm:[3,2,1,0] // GFX11: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index 2799ea7b8ef8b..6a47dce49ed2a 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -614,14 +614,23 @@ v_sat_pk_u8_i16 v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sat_pk_u8_i16 v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_sin_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_sin_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_sin_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_sin_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_sin_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_sin_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +v_sin_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_sin_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_sin_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_sin_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00] v_sin_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index caa73b7b9f047..34f10c98e1468 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -779,6 +779,12 @@ v_sat_pk_u8_i16_e32 v199.l, v5.l quad_perm:[3,2,1,0] v_sin_f16_e32 v128, 0xfe0b // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_sin_f16_e32 v128.h, 0xfe0b +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v128.l, 0xfe0b +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + v_sin_f16_e32 v255, v1 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -788,6 +794,24 @@ v_sin_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] 
v_sin_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction +v_sin_f16_e32 v255.h, v1.h +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.l, v1.l +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + v_sin_f16_e32 v5, v199 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -797,6 +821,24 @@ v_sin_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_sin_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction +v_sin_f16_e32 v5.h, v199.h +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.l, v199.l +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + v_sqrt_f16_e32 v128.h, 0xfe0b // GFX11: :[[@LINE-1]]:16: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index 0dd1bf6142189..9e424fbd004e4 100644 --- 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -1886,71 +1886,137 @@ v_sat_pk_u8_i16 v199.l, v5 dpp8:[7,6,5,4,3,2,1,0] v_sat_pk_u8_i16 v199.l, v5 quad_perm:[3,2,1,0] // GFX11: v_sat_pk_u8_i16_e64_dpp v199.l, v5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff] -v_sin_f16 v128, 0xfe0b -// GFX11: v_sin_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_sin_f16 v128.h, 0xfe0b +// GFX11: v_sin_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_sin_f16 v255, -1 -// GFX11: v_sin_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +v_sin_f16 v128.l, 0xfe0b +// GFX11: v_sin_f16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_sin_f16 v255, 0.5 -// GFX11: v_sin_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x00] +v_sin_f16 v255.h, -1 +// GFX11: v_sin_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0xc1,0x00,0x00,0x00] -v_sin_f16 v255, exec_hi -// GFX11: v_sin_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +v_sin_f16 v255.h, 0.5 +// GFX11: v_sin_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0xf0,0x00,0x00,0x00] -v_sin_f16 v255, exec_lo -// GFX11: v_sin_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +v_sin_f16 v255.h, exec_hi +// GFX11: v_sin_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7f,0x00,0x00,0x00] -v_sin_f16 v255, m0 -// GFX11: v_sin_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +v_sin_f16 v255.h, exec_lo +// GFX11: v_sin_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7e,0x00,0x00,0x00] -v_sin_f16 v255, null -// GFX11: v_sin_f16_e64 v255, null ; encoding: 
[0xff,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +v_sin_f16 v255.h, m0 +// GFX11: v_sin_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7d,0x00,0x00,0x00] -v_sin_f16 v255, s1 -// GFX11: v_sin_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +v_sin_f16 v255.h, null +// GFX11: v_sin_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7c,0x00,0x00,0x00] -v_sin_f16 v255, s105 -// GFX11: v_sin_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +v_sin_f16 v255.h, s1 +// GFX11: v_sin_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x01,0x00,0x00,0x00] -v_sin_f16 v255, src_scc -// GFX11: v_sin_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x00] +v_sin_f16 v255.h, s105 +// GFX11: v_sin_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x69,0x00,0x00,0x00] -v_sin_f16 v255, ttmp15 -// GFX11: v_sin_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +v_sin_f16 v255.h, src_scc +// GFX11: v_sin_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0xfd,0x00,0x00,0x00] -v_sin_f16 v255, v1 -// GFX11: v_sin_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +v_sin_f16 v255.h, ttmp15 +// GFX11: v_sin_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7b,0x00,0x00,0x00] -v_sin_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_sin_f16 v255.h, v1.h +// GFX11: v_sin_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00] -v_sin_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_sin_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v255.h, v1.h op_sel:[1,1] 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_sin_f16 v255, v127 -// GFX11: v_sin_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x01,0x00,0x00] +v_sin_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_sin_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_sin_f16 v255.h, v127.h +// GFX11: v_sin_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe0,0xd5,0x7f,0x01,0x00,0x00] -v_sin_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_sin_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_sin_f16 v255, vcc_hi -// GFX11: v_sin_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +v_sin_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_sin_f16 v255, vcc_lo -// GFX11: v_sin_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +v_sin_f16 v255.h, vcc_hi +// GFX11: v_sin_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x6b,0x00,0x00,0x00] -v_sin_f16 v5, v199 -// GFX11: v_sin_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xe0,0xd5,0xc7,0x01,0x00,0x00] +v_sin_f16 v255.h, vcc_lo +// GFX11: v_sin_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x6a,0x00,0x00,0x00] -v_sin_f16 v5, v199 
dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_sin_f16 v255.l, -1 +// GFX11: v_sin_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] -v_sin_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_sin_f16 v255.l, 0.5 +// GFX11: v_sin_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x00] + +v_sin_f16 v255.l, exec_hi +// GFX11: v_sin_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] + +v_sin_f16 v255.l, exec_lo +// GFX11: v_sin_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] + +v_sin_f16 v255.l, m0 +// GFX11: v_sin_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] + +v_sin_f16 v255.l, null +// GFX11: v_sin_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] + +v_sin_f16 v255.l, s1 +// GFX11: v_sin_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] + +v_sin_f16 v255.l, s105 +// GFX11: v_sin_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] + +v_sin_f16 v255.l, src_scc +// GFX11: v_sin_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x00] + +v_sin_f16 v255.l, ttmp15 +// GFX11: v_sin_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] + +v_sin_f16 v255.l, v1.l +// GFX11: v_sin_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] + +v_sin_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_sin_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_sin_f16 v255.l, v127.l +// GFX11: v_sin_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x01,0x00,0x00] + +v_sin_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_sin_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_sin_f16 v255.l, vcc_hi +// GFX11: v_sin_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] + +v_sin_f16 v255.l, vcc_lo +// GFX11: v_sin_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] + +v_sin_f16 v5.h, v199.h +// GFX11: v_sin_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe0,0xd5,0xc7,0x01,0x00,0x00] + +v_sin_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_sin_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_sin_f16 v5.l, v199.l +// GFX11: v_sin_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xe0,0xd5,0xc7,0x01,0x00,0x00] + +v_sin_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_sin_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_sqrt_f16 v128, 0xfe0b // GFX11: v_sqrt_f16_e64 v128, 0xfe0b ; encoding: 
[0x80,0x00,0xd5,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index 8de72e74c2856..3992b869c46d5 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -2698,47 +2698,56 @@ v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bou v_sat_pk_u8_i16_e64_dpp v255.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] -v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_sin_f16_e64_dpp v5, v1 row_mirror -// GFX11: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_mirror +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_half_mirror -// GFX11: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_shl:1 -// GFX11: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_shl:15 -// GFX11: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_shr:1 -// GFX11: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_shr:15 -// GFX11: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_ror:1 -// GFX11: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 +// 
GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_ror:15 -// GFX11: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_sin_f16_e64_dpp v255, -|v255| clamp div:2 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_sin_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_sin_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_sin_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index 182a13831ec6d..a123c73c73bcb 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -781,17 +781,26 @@ v_sat_pk_u8_i16_e64_dpp v255.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 v_sat_pk_u8_i16_e64_dpp v255.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: [0xff,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] -v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_sin_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_sin_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x08,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_sin_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0xc1,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 17678e3bd9f08..4b055165871cf 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -3247,50 +3247,59 @@ v_sat_pk_u8_i16_e64 v255.l, 0xfe0b v_sat_pk_u8_i16_e64 v255.h, 0xfe0b // GFX11: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_sin_f16_e64 v5, v1 -// GFX11: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +v_sin_f16_e64 v5.l, v1.l +// GFX11: v_sin_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] -v_sin_f16_e64 v5, v255 -// GFX11: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] +v_sin_f16_e64 v5.l, v255.l +// GFX11: v_sin_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] -v_sin_f16_e64 v5, s1 -// GFX11: v_sin_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, s1 +// GFX11: v_sin_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] -v_sin_f16_e64 v5, s105 -// GFX11: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, s105 +// GFX11: v_sin_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] -v_sin_f16_e64 v5, vcc_lo -// GFX11: v_sin_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, vcc_lo +// GFX11: v_sin_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] -v_sin_f16_e64 v5, vcc_hi -// GFX11: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, vcc_hi +// GFX11: v_sin_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] -v_sin_f16_e64 v5, ttmp15 -// GFX11: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, ttmp15 +// GFX11: v_sin_f16_e64 v5.l, ttmp15 ; encoding: 
[0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] -v_sin_f16_e64 v5, m0 -// GFX11: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, m0 +// GFX11: v_sin_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] -v_sin_f16_e64 v5, exec_lo -// GFX11: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, exec_lo +// GFX11: v_sin_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] -v_sin_f16_e64 v5, exec_hi -// GFX11: v_sin_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, exec_hi +// GFX11: v_sin_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] -v_sin_f16_e64 v5, null -// GFX11: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, null +// GFX11: v_sin_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] -v_sin_f16_e64 v5, -1 -// GFX11: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, -1 +// GFX11: v_sin_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] -v_sin_f16_e64 v5, 0.5 mul:2 -// GFX11: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] +v_sin_f16_e64 v5.l, 0.5 mul:2 +// GFX11: v_sin_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] -v_sin_f16_e64 v5, src_scc mul:4 -// GFX11: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] +v_sin_f16_e64 v5.l, src_scc mul:4 +// GFX11: v_sin_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] -v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX11: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX11: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: 
[0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_sin_f16_e64 v5.h, v1.h +// GFX11: [0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00] + +v_sin_f16_e64 v5.l, v255.h +// GFX11: [0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00] + +v_sin_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX11: [0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_sin_f32_e64 v5, v1 // GFX11: v_sin_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index 4f82643fd4886..ed90e480012c0 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -3378,50 +3378,62 @@ v_sat_pk_u8_i16 v5.h, src_scc v_sat_pk_u8_i16 v127.h, 0xfe0b // GFX12: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] -v_sin_f16 v5, v1 -// GFX12: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e] +v_sin_f16 v5.l, v1.l +// GFX12: v_sin_f16_e32 v5.l, v1.l ; encoding: [0x01,0xc1,0x0a,0x7e] -v_sin_f16 v5, v127 -// GFX12: v_sin_f16_e32 v5, v127 ; encoding: [0x7f,0xc1,0x0a,0x7e] +v_sin_f16 v5.l, v127.l +// GFX12: v_sin_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc1,0x0a,0x7e] -v_sin_f16 v5, s1 -// GFX12: v_sin_f16_e32 v5, s1 ; encoding: [0x01,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, s1 +// GFX12: v_sin_f16_e32 v5.l, s1 ; encoding: [0x01,0xc0,0x0a,0x7e] -v_sin_f16 v5, s105 -// GFX12: v_sin_f16_e32 v5, s105 ; encoding: [0x69,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, s105 +// GFX12: v_sin_f16_e32 v5.l, s105 ; encoding: [0x69,0xc0,0x0a,0x7e] -v_sin_f16 v5, vcc_lo -// GFX12: v_sin_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, vcc_lo +// GFX12: v_sin_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e] -v_sin_f16 v5, vcc_hi -// GFX12: v_sin_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, vcc_hi +// GFX12: v_sin_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e] -v_sin_f16 v5, ttmp15 -// GFX12: v_sin_f16_e32 v5, ttmp15 
; encoding: [0x7b,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, ttmp15 +// GFX12: v_sin_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e] -v_sin_f16 v5, m0 -// GFX12: v_sin_f16_e32 v5, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, m0 +// GFX12: v_sin_f16_e32 v5.l, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e] -v_sin_f16 v5, exec_lo -// GFX12: v_sin_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, exec_lo +// GFX12: v_sin_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e] -v_sin_f16 v5, exec_hi -// GFX12: v_sin_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, exec_hi +// GFX12: v_sin_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e] -v_sin_f16 v5, null -// GFX12: v_sin_f16_e32 v5, null ; encoding: [0x7c,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, null +// GFX12: v_sin_f16_e32 v5.l, null ; encoding: [0x7c,0xc0,0x0a,0x7e] -v_sin_f16 v5, -1 -// GFX12: v_sin_f16_e32 v5, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, -1 +// GFX12: v_sin_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e] -v_sin_f16 v5, 0.5 -// GFX12: v_sin_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, 0.5 +// GFX12: v_sin_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e] -v_sin_f16 v5, src_scc -// GFX12: v_sin_f16_e32 v5, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e] +v_sin_f16 v5.l, src_scc +// GFX12: v_sin_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e] -v_sin_f16 v127, 0xfe0b -// GFX12: v_sin_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_sin_f16 v127.l, 0xfe0b +// GFX12: v_sin_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_sin_f16 v5.l, v1.h +// GFX12: v_sin_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc1,0x0a,0x7e] + +v_sin_f16 v5.l, v127.h +// GFX12: v_sin_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc1,0x0a,0x7e] + +v_sin_f16 v5.h, src_scc +// GFX12: v_sin_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7f] + +v_sin_f16 v127.h, 0xfe0b +// GFX12: v_sin_f16_e32 v127.h, 
0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_sin_f32 v5, v1 // GFX12: v_sin_f32_e32 v5, v1 ; encoding: [0x01,0x6b,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index 2b3a52cf4e804..90968055e2a82 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -2644,47 +2644,53 @@ v_sat_pk_u8_i16 v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi: v_sat_pk_u8_i16 v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x05,0x30] -v_sin_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_sin_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_sin_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_sin_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_sin_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_sin_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_sin_f16 v5, v1 row_mirror -// GFX12: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_sin_f16 v5.l, v1.l row_mirror +// GFX12: v_sin_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_sin_f16 v5, v1 row_half_mirror -// GFX12: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_sin_f16 v5.l, v1.l row_half_mirror +// GFX12: v_sin_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_sin_f16 v5, v1 row_shl:1 -// GFX12: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_sin_f16 v5.l, v1.l row_shl:1 +// GFX12: v_sin_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_sin_f16 v5, v1 row_shl:15 -// GFX12: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_sin_f16 v5.l, v1.l row_shl:15 +// GFX12: v_sin_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_sin_f16 v5, v1 row_shr:1 -// GFX12: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_sin_f16 v5.l, v1.l row_shr:1 +// GFX12: v_sin_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_sin_f16 v5, v1 row_shr:15 -// GFX12: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_sin_f16 v5.l, v1.l row_shr:15 +// GFX12: v_sin_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_sin_f16 v5, v1 row_ror:1 -// GFX12: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_sin_f16 v5.l, v1.l row_ror:1 +// GFX12: v_sin_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_sin_f16 v5, v1 row_ror:15 -// GFX12: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_sin_f16 v5.l, v1.l row_ror:15 +// GFX12: v_sin_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_sin_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf 
-// GFX12: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_sin_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_sin_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_sin_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_sin_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_sin_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_sin_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_sin_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_sin_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_sin_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_sin_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_sin_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_sin_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_sin_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_sin_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_sin_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_sin_f32 v5, v1 quad_perm:[3,2,1,0] // GFX12: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index 977d5b08b80ee..0ce0087918f56 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -631,14 +631,20 @@ v_sat_pk_u8_i16 v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sat_pk_u8_i16 v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_sin_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_sin_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_sin_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_sin_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_sin_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_sin_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_sin_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_sin_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_sin_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xe9,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00] v_sin_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s index 1b6734a6a652b..92a0d15bbc6f0 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s @@ -689,6 +689,12 @@ v_sat_pk_u8_i16_e32 v199.l, v5 quad_perm:[3,2,1,0] v_sin_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_sin_f16_e32 v128.h, 0xfe0b +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v128.l, 0xfe0b +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + v_sin_f16_e32 v255, v1 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -698,6 +704,24 @@ v_sin_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_sin_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction +v_sin_f16_e32 v255.h, v1.h +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.l, v1.l +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_sin_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + v_sin_f16_e32 v5, v199 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -707,6 +731,24 @@ v_sin_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_sin_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:24: 
error: invalid operand for instruction +v_sin_f16_e32 v5.h, v199.h +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.l, v199.l +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_sin_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + v_sqrt_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s index 9d36ea0b9f479..bbe7b65d03281 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s @@ -1846,71 +1846,137 @@ v_sat_pk_u8_i16 v199.h, v5 dpp8:[7,6,5,4,3,2,1,0] v_sat_pk_u8_i16 v199.h, v5 quad_perm:[3,2,1,0] // GFX12: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff] -v_sin_f16 v128, 0xfe0b -// GFX12: v_sin_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_sin_f16 v128.h, 0xfe0b +// GFX12: v_sin_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_sin_f16 v255, -1 -// GFX12: v_sin_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +v_sin_f16 v128.l, 0xfe0b +// GFX12: v_sin_f16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_sin_f16 v255, 0.5 -// GFX12: v_sin_f16_e64 v255, 0.5 ; encoding: 
[0xff,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x00] +v_sin_f16 v255.h, -1 +// GFX12: v_sin_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0xc1,0x00,0x00,0x00] -v_sin_f16 v255, exec_hi -// GFX12: v_sin_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +v_sin_f16 v255.h, 0.5 +// GFX12: v_sin_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0xf0,0x00,0x00,0x00] -v_sin_f16 v255, exec_lo -// GFX12: v_sin_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +v_sin_f16 v255.h, exec_hi +// GFX12: v_sin_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7f,0x00,0x00,0x00] -v_sin_f16 v255, m0 -// GFX12: v_sin_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +v_sin_f16 v255.h, exec_lo +// GFX12: v_sin_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7e,0x00,0x00,0x00] -v_sin_f16 v255, null -// GFX12: v_sin_f16_e64 v255, null ; encoding: [0xff,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +v_sin_f16 v255.h, m0 +// GFX12: v_sin_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7d,0x00,0x00,0x00] -v_sin_f16 v255, s1 -// GFX12: v_sin_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +v_sin_f16 v255.h, null +// GFX12: v_sin_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7c,0x00,0x00,0x00] -v_sin_f16 v255, s105 -// GFX12: v_sin_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +v_sin_f16 v255.h, s1 +// GFX12: v_sin_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x01,0x00,0x00,0x00] -v_sin_f16 v255, src_scc -// GFX12: v_sin_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x00] +v_sin_f16 v255.h, s105 +// GFX12: v_sin_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x69,0x00,0x00,0x00] -v_sin_f16 v255, ttmp15 -// GFX12: v_sin_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +v_sin_f16 
v255.h, src_scc +// GFX12: v_sin_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0xfd,0x00,0x00,0x00] -v_sin_f16 v255, v1 -// GFX12: v_sin_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +v_sin_f16 v255.h, ttmp15 +// GFX12: v_sin_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7b,0x00,0x00,0x00] -v_sin_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_sin_f16 v255.h, v1.h +// GFX12: v_sin_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00] -v_sin_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_sin_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_sin_f16 v255, v127 -// GFX12: v_sin_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x01,0x00,0x00] +v_sin_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_sin_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_sin_f16 v255.h, v127.h +// GFX12: v_sin_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe0,0xd5,0x7f,0x01,0x00,0x00] -v_sin_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_sin_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_e64_dpp 
v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_sin_f16 v255, vcc_hi -// GFX12: v_sin_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +v_sin_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_sin_f16 v255, vcc_lo -// GFX12: v_sin_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +v_sin_f16 v255.h, vcc_hi +// GFX12: v_sin_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x6b,0x00,0x00,0x00] -v_sin_f16 v5, v199 -// GFX12: v_sin_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xe0,0xd5,0xc7,0x01,0x00,0x00] +v_sin_f16 v255.h, vcc_lo +// GFX12: v_sin_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x6a,0x00,0x00,0x00] -v_sin_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_sin_f16 v255.l, -1 +// GFX12: v_sin_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] -v_sin_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_sin_f16 v255.l, 0.5 +// GFX12: v_sin_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x00] + +v_sin_f16 v255.l, exec_hi +// GFX12: v_sin_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] + +v_sin_f16 v255.l, exec_lo +// GFX12: v_sin_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] + +v_sin_f16 v255.l, m0 +// GFX12: v_sin_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] + +v_sin_f16 v255.l, null +// GFX12: v_sin_f16_e64 v255.l, null ; encoding: 
[0xff,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] + +v_sin_f16 v255.l, s1 +// GFX12: v_sin_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] + +v_sin_f16 v255.l, s105 +// GFX12: v_sin_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] + +v_sin_f16 v255.l, src_scc +// GFX12: v_sin_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x00] + +v_sin_f16 v255.l, ttmp15 +// GFX12: v_sin_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] + +v_sin_f16 v255.l, v1.l +// GFX12: v_sin_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] + +v_sin_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_sin_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_sin_f16 v255.l, v127.l +// GFX12: v_sin_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x01,0x00,0x00] + +v_sin_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_sin_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_sin_f16 v255.l, vcc_hi +// GFX12: v_sin_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] + +v_sin_f16 v255.l, vcc_lo +// GFX12: v_sin_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] + +v_sin_f16 v5.h, v199.h +// GFX12: v_sin_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe0,0xd5,0xc7,0x01,0x00,0x00] + +v_sin_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: 
v_sin_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_sin_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_sin_f16 v5.l, v199.l +// GFX12: v_sin_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xe0,0xd5,0xc7,0x01,0x00,0x00] + +v_sin_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_sin_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_sqrt_f16 v128, 0xfe0b // GFX12: v_sqrt_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xd5,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index 71c12a1333ebc..5af15f2eb971f 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -3397,50 +3397,59 @@ v_sat_pk_u8_i16_e64 v255, 0xfe0b v_sat_pk_u8_i16_e64 v255.h, 0xfe0b // GFX12: v_sat_pk_u8_i16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_sin_f16_e64 v5, v1 -// GFX12: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +v_sin_f16_e64 v5.l, v1.l +// GFX12: v_sin_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] -v_sin_f16_e64 v5, v255 -// GFX12: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] +v_sin_f16_e64 v5.l, v255.l +// GFX12: v_sin_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] -v_sin_f16_e64 v5, s1 -// GFX12: v_sin_f16_e64 
v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, s1 +// GFX12: v_sin_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] -v_sin_f16_e64 v5, s105 -// GFX12: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, s105 +// GFX12: v_sin_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] -v_sin_f16_e64 v5, vcc_lo -// GFX12: v_sin_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, vcc_lo +// GFX12: v_sin_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] -v_sin_f16_e64 v5, vcc_hi -// GFX12: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, vcc_hi +// GFX12: v_sin_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] -v_sin_f16_e64 v5, ttmp15 -// GFX12: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, ttmp15 +// GFX12: v_sin_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] -v_sin_f16_e64 v5, m0 -// GFX12: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, m0 +// GFX12: v_sin_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] -v_sin_f16_e64 v5, exec_lo -// GFX12: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, exec_lo +// GFX12: v_sin_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] -v_sin_f16_e64 v5, exec_hi -// GFX12: v_sin_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, exec_hi +// GFX12: v_sin_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] -v_sin_f16_e64 v5, null -// GFX12: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, null +// GFX12: v_sin_f16_e64 v5.l, null ; encoding: 
[0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] -v_sin_f16_e64 v5, -1 -// GFX12: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, -1 +// GFX12: v_sin_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] -v_sin_f16_e64 v5, 0.5 mul:2 -// GFX12: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] +v_sin_f16_e64 v5.l, 0.5 mul:2 +// GFX12: v_sin_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] -v_sin_f16_e64 v5, src_scc mul:4 -// GFX12: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] +v_sin_f16_e64 v5.l, src_scc mul:4 +// GFX12: v_sin_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] -v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX12: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX12: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_sin_f16_e64 v5.h, v1.h +// GFX12: v_sin_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00] + +v_sin_f16_e64 v5.l, v255.h +// GFX12: v_sin_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00] + +v_sin_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX12: v_sin_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_sin_f32_e64 v5, v1 // GFX12: v_sin_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 42166032124a3..39638cefd44ad 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -2560,47 +2560,56 @@ 
v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound v_sat_pk_u8_i16_e64_dpp v255.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] -v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_sin_f16_e64_dpp v5, v1 row_mirror -// GFX12: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_mirror +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_half_mirror -// GFX12: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_shl:1 -// 
GFX12: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_shl:15 -// GFX12: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_shr:1 -// GFX12: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_shr:15 -// GFX12: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_ror:1 -// GFX12: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_ror:15 -// GFX12: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_sin_f16_e64_dpp v255.l, -|v255.l| clamp 
div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_sin_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_sin_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_sin_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index d65d2004fc1e7..a6cef6f134b0a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -742,17 +742,26 @@ v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 v_sat_pk_u8_i16_e64_dpp v255.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] -v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_sin_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_sin_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_sin_f16_e64_dpp v255.h, 
-|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt index 38c573a19ba00..0abced9f2f77b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt @@ -3323,49 +3323,82 @@ # GFX11-REAL16: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] 0x01,0xc1,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, v1.l ; encoding: [0x01,0xc1,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e] 0x7f,0xc1,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, v127 ; encoding: [0x7f,0xc1,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc1,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, v127 ; encoding: [0x7f,0xc1,0x0a,0x7e] 0x01,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, s1 ; encoding: [0x01,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, s1 ; encoding: [0x01,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, s1 ; encoding: [0x01,0xc0,0x0a,0x7e] 0x69,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, s105 ; encoding: [0x69,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, s105 ; encoding: [0x69,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, s105 ; encoding: [0x69,0xc0,0x0a,0x7e] 0x6a,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e] 
0x6b,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e] 0x7b,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e] 0x7d,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e] 0x7e,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e] 0x7f,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e] 0x7c,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, null ; encoding: [0x7c,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, null ; encoding: [0x7c,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, null ; encoding: [0x7c,0xc0,0x0a,0x7e] 0xc1,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e] 0xf0,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e] 0xfd,0xc0,0x0a,0x7e -# GFX11: v_sin_f16_e32 v5, src_scc ; encoding: 
[0xfd,0xc0,0x0a,0x7e] +# GFX11-REAL16: v_sin_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e] 0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00 -# GFX11: v_sin_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +0x81,0xc1,0x0a,0x7e +# GFX11-REAL16: v_sin_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc1,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xc1,0x0a,0x7e] + +0xff,0xc1,0x0a,0x7e +# GFX11-REAL16: v_sin_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc1,0x0a,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xc1,0x0a,0x7e] + +0xf0,0xc0,0xfe,0x7e +# GFX11-REAL16: v_sin_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc0,0xfe,0x7e] +# GFX11-FAKE16: v_sin_f16_e32 v127, 0.5 ; encoding: [0xf0,0xc0,0xfe,0x7e] + +0xfd,0xc0,0x0a,0x7f +# GFX11-REAL16: v_sin_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7f] + +0xff,0xc0,0xfe,0x7f,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_sin_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7f,0x0b,0xfe,0x00,0x00] 0x01,0x6b,0x0a,0x7e # GFX11: v_sin_f32_e32 v5, v1 ; encoding: [0x01,0x6b,0x0a,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index b801e393c635d..7043f3b2b9f29 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -2619,46 +2619,72 @@ # GFX11-FAKE16: v_lshlrev_b32_e32 v6, v255, v183 ; encoding: [0xff,0x6f,0x0d,0x30] 0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp 
v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, 
v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX11: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX11: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX11: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-REAL16: v_sin_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xc0,0xfe,0x7e,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_sin_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_sin_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_sin_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX11-REAL16: v_sin_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX11: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index faf3c6f628b95..d2eb919849fd3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -502,10 +502,23 @@ # GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX11: v_sin_f16_dpp v127, v127 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xc0,0xfe,0x7e,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_sin_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_sin_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +0xe9,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_sin_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187 ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_sin_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX11: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index f689c43b75365..5c3fde7b80556 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -2773,46 +2773,72 @@ # GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_shl:1 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 
0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX11: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX11: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX11: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 
bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX11: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index 48824399a0887..28b39f4b0344a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -753,16 +753,32 @@ # GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX11: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, 
v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX11: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX11: v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index 04c9094465b3b..d078bc2b8cb04 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -3295,49 +3295,76 @@ # GFX11-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +# GFX11-FAKE16: 
v_sin_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] 
0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00 -# GFX11: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08 -# GFX11: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-REAL16: v_sin_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-FAKE16: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10 -# GFX11: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-REAL16: v_sin_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-FAKE16: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX11: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| 
clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00 +# GFX11-REAL16: v_sin_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00 +# GFX11-REAL16: v_sin_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_sin_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00 # GFX11: v_sin_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index b93a6252beaeb..46dedd970a320 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -2747,46 +2747,68 @@ # GFX12-FAKE16: v_lshlrev_b32_e32 v6, v255, v183 ; encoding: [0xff,0x6f,0x0d,0x30] 0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_sin_f16_dpp v5, v1 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: 
v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: 
v_sin_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX12: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_sin_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_sin_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX12-REAL16: v_sin_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX12: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 092ba9b88f951..551dab7ec3e7c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -508,10 +508,19 @@ # GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX12: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_sin_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_sin_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 7fdb9e0ac6977..0d01be721e60d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -3341,49 +3341,76 @@ # GFX12-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, vcc_lo ; 
encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: 
v_sin_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_sin_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_sin_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX12: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00 +# GFX12-REAL16: v_sin_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00 +# GFX12-REAL16: v_sin_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: 
[0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_sin_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_sin_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index ad491dc02d384..d501d62c006eb 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -2617,46 +2617,72 @@ # GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: 
v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 
0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 21b4d0572bf37..aba7d3ff43d8b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -717,16 +717,32 @@ # GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_sin_f32_e64_dpp v5, v1 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From 5ee8418057646f4640cd1bb60e73f9e5129ea12e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Jan 2025 11:19:22 -0800 Subject: [PATCH 113/480] [Docs][TableGen] Remove ReturnRange from the SearchIndex documentation. NFC SearchIndex doesn't support ReturnRange. It is only supported for the primary key. --- llvm/docs/TableGen/BackEnds.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/docs/TableGen/BackEnds.rst b/llvm/docs/TableGen/BackEnds.rst index f73269e717184..94af2e4ab8f5c 100644 --- a/llvm/docs/TableGen/BackEnds.rst +++ b/llvm/docs/TableGen/BackEnds.rst @@ -1071,8 +1071,6 @@ function. This class provides three fields. * ``bit EarlyOut``. See the third example in `Generic Tables`_. -* ``bit ReturnRange``. See the second example in `Generic Tables`_. - Here is an example of a secondary key added to the ``CTable`` above. The generated function looks up entries based on the ``Name`` and ``Kind`` fields. 
From 432a871ba8f6a62272a7ef1162305328b0de7802 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Fri, 3 Jan 2025 11:23:35 -0800 Subject: [PATCH 114/480] Deprecate order file instrumentation (#121514) --- clang/include/clang/Driver/Options.td | 2 +- clang/lib/Driver/ToolChains/Clang.cpp | 22 +++++++++++++--------- clang/test/Driver/clang_f_opts.c | 2 ++ 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 523761f5e0d80..12edfbb171d34 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1890,7 +1890,7 @@ defm pseudo_probe_for_profiling : BoolFOption<"pseudo-probe-for-profiling", " pseudo probes for sample profiling">>; def forder_file_instrumentation : Flag<["-"], "forder-file-instrumentation">, Group, Visibility<[ClangOption, CC1Option, CLOption]>, - HelpText<"Generate instrumented code to collect order file into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)">; + HelpText<"Generate instrumented code to collect order file into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var). Deprecated, please use temporal profiling.">; def fprofile_list_EQ : Joined<["-"], "fprofile-list=">, Group, Visibility<[ClangOption, CC1Option, CLOption]>, HelpText<"Filename defining the list of functions/files to instrument. " diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index a020e00cd1739..daf863c78d303 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -8010,15 +8010,19 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, } } - if (Args.hasArg(options::OPT_forder_file_instrumentation)) { - CmdArgs.push_back("-forder-file-instrumentation"); - // Enable order file instrumentation when ThinLTO is not on. 
When ThinLTO is - // on, we need to pass these flags as linker flags and that will be handled - // outside of the compiler. - if (!IsUsingLTO) { - CmdArgs.push_back("-mllvm"); - CmdArgs.push_back("-enable-order-file-instrumentation"); - } + if (const Arg *A = + Args.getLastArg(options::OPT_forder_file_instrumentation)) { + D.Diag(diag::warn_drv_deprecated_arg) + << A->getAsString(Args) << /*hasReplacement=*/true + << "-mllvm -pgo-temporal-instrumentation"; + CmdArgs.push_back("-forder-file-instrumentation"); + // Enable order file instrumentation when ThinLTO is not on. When ThinLTO is + // on, we need to pass these flags as linker flags and that will be handled + // outside of the compiler. + if (!IsUsingLTO) { + CmdArgs.push_back("-mllvm"); + CmdArgs.push_back("-enable-order-file-instrumentation"); + } } if (Arg *A = Args.getLastArg(options::OPT_fforce_enable_int128, diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c index ddbf1fd951c84..2b72068eae1ee 100644 --- a/clang/test/Driver/clang_f_opts.c +++ b/clang/test/Driver/clang_f_opts.c @@ -364,6 +364,7 @@ // RUN: -fno-devirtualize-speculatively \ // RUN: -fslp-vectorize-aggressive \ // RUN: -fno-slp-vectorize-aggressive \ +// RUN: -forder-file-instrumentation \ // RUN: %s 2>&1 | FileCheck --check-prefix=CHECK-WARNING %s // CHECK-WARNING-DAG: optimization flag '-finline-limit=1000' is not supported // CHECK-WARNING-DAG: optimization flag '-finline-limit' is not supported @@ -423,6 +424,7 @@ // CHECK-WARNING-DAG: optimization flag '-fno-devirtualize-speculatively' is not supported // CHECK-WARNING-DAG: the flag '-fslp-vectorize-aggressive' has been deprecated and will be ignored // CHECK-WARNING-DAG: the flag '-fno-slp-vectorize-aggressive' has been deprecated and will be ignored +// CHECK-WARNING-DAG: argument '-forder-file-instrumentation' is deprecated, use '-mllvm -pgo-temporal-instrumentation' instead // Test that we mute the warning on these // RUN: %clang -### -finline-limit=1000 
-Wno-invalid-command-line-argument \ From 11c6af666b75d03ac67dfdf9ba190587b7efbcd8 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 3 Jan 2025 19:28:02 +0000 Subject: [PATCH 115/480] [VPlan] Fix name ExitVPBB -> MiddleVPBB (NFC). ExitVPBB actually refers to the middle block, clarify name. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f2f8a85b7cc23..7ef5295bb1276 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7785,16 +7785,16 @@ DenseMap LoopVectorizationPlanner::executePlan( BestVPlan.execute(&State); - auto *ExitVPBB = BestVPlan.getMiddleBlock(); + auto *MiddleVPBB = BestVPlan.getMiddleBlock(); // 2.5 When vectorizing the epilogue, fix reduction and induction resume // values from the additional bypass block. if (VectorizingEpilogue) { assert(!ILV.Legal->hasUncountableEarlyExit() && "Epilogue vectorisation not yet supported with early exits"); BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock(); - for (VPRecipeBase &R : *ExitVPBB) { + for (VPRecipeBase &R : *MiddleVPBB) { fixReductionScalarResumeWhenVectorizingEpilog( - &R, State, State.CFG.VPBB2IRBB[ExitVPBB], BypassBlock); + &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock); } BasicBlock *PH = OrigLoop->getLoopPreheader(); for (const auto &[IVPhi, _] : Legal->getInductionVars()) { @@ -7840,7 +7840,7 @@ DenseMap LoopVectorizationPlanner::executePlan( // 4. Adjust branch weight of the branch in the middle block. auto *MiddleTerm = - cast(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); + cast(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator()); if (MiddleTerm->isConditional() && hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { // Assume that `Count % VectorTripCount` is equally distributed. 
From cb2eafe6ac72064529da5219434e351851a2b68f Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 3 Jan 2025 19:37:58 +0000 Subject: [PATCH 116/480] [TableGen] Use SmallVectors for preprocessor include stack. NFC. (#121571) This is just a minor cleanup and a small step in the direction of using LLVM containers in preference to STL containers in lib/TableGen. --- llvm/lib/TableGen/TGLexer.cpp | 41 +++++++++++++---------------------- llvm/lib/TableGen/TGLexer.h | 5 ++--- 2 files changed, 17 insertions(+), 29 deletions(-) diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index eee42511804f5..e23aec6efba59 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -81,8 +81,7 @@ TGLexer::TGLexer(SourceMgr &SM, ArrayRef Macros) : SrcMgr(SM) { TokStart = nullptr; // Pretend that we enter the "top-level" include file. - PrepIncludeStack.push_back( - std::make_unique>()); + PrepIncludeStack.emplace_back(); // Add all macros defined on the command line to the DefinedMacros set. // Check invalid macro names and print fatal error if we find one. @@ -453,8 +452,7 @@ bool TGLexer::LexInclude() { CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); CurPtr = CurBuf.begin(); - PrepIncludeStack.push_back( - std::make_unique>()); + PrepIncludeStack.emplace_back(); return false; } @@ -656,17 +654,13 @@ tgtok::TokKind TGLexer::LexExclaim() { bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) { // Report an error, if preprocessor control stack for the current // file is not empty. - if (!PrepIncludeStack.back()->empty()) { + if (!PrepIncludeStack.back().empty()) { prepReportPreprocessorStackError(); return false; } // Pop the preprocessing controls from the include stack. 
- if (PrepIncludeStack.empty()) { - PrintFatalError("preprocessor include stack is empty"); - } - PrepIncludeStack.pop_back(); if (IncludeStackMustBeEmpty) { @@ -761,7 +755,7 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, // Regardless of whether we are processing tokens or not, // we put the #ifdef control on stack. // Note that MacroIsDefined has been canonicalized against ifdef. - PrepIncludeStack.back()->push_back( + PrepIncludeStack.back().push_back( {tgtok::Ifdef, MacroIsDefined, SMLoc::getFromPointer(TokStart)}); if (!prepSkipDirectiveEnd()) @@ -789,10 +783,10 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, } else if (Kind == tgtok::Else) { // Check if this #else is correct before calling prepSkipDirectiveEnd(), // which will move CurPtr away from the beginning of #else. - if (PrepIncludeStack.back()->empty()) + if (PrepIncludeStack.back().empty()) return ReturnError(TokStart, "#else without #ifdef or #ifndef"); - PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back()->back(); + PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back().back(); if (IfdefEntry.Kind != tgtok::Ifdef) { PrintError(TokStart, "double #else"); @@ -801,9 +795,8 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, // Replace the corresponding #ifdef's control with its negation // on the control stack. - PrepIncludeStack.back()->pop_back(); - PrepIncludeStack.back()->push_back( - {Kind, !IfdefEntry.IsDefined, SMLoc::getFromPointer(TokStart)}); + PrepIncludeStack.back().back() = {Kind, !IfdefEntry.IsDefined, + SMLoc::getFromPointer(TokStart)}; if (!prepSkipDirectiveEnd()) return ReturnError(CurPtr, "only comments are supported after #else"); @@ -822,10 +815,10 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, } else if (Kind == tgtok::Endif) { // Check if this #endif is correct before calling prepSkipDirectiveEnd(), // which will move CurPtr away from the beginning of #endif. 
- if (PrepIncludeStack.back()->empty()) + if (PrepIncludeStack.back().empty()) return ReturnError(TokStart, "#endif without #ifdef"); - auto &IfdefOrElseEntry = PrepIncludeStack.back()->back(); + auto &IfdefOrElseEntry = PrepIncludeStack.back().back(); if (IfdefOrElseEntry.Kind != tgtok::Ifdef && IfdefOrElseEntry.Kind != tgtok::Else) { @@ -836,7 +829,7 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, if (!prepSkipDirectiveEnd()) return ReturnError(CurPtr, "only comments are supported after #endif"); - PrepIncludeStack.back()->pop_back(); + PrepIncludeStack.back().pop_back(); // If we were processing tokens before this #endif, then // we should continue it. @@ -1055,20 +1048,16 @@ bool TGLexer::prepSkipDirectiveEnd() { } bool TGLexer::prepIsProcessingEnabled() { - for (const PreprocessorControlDesc &I : - llvm::reverse(*PrepIncludeStack.back())) - if (!I.IsDefined) - return false; - - return true; + return all_of(PrepIncludeStack.back(), + [](const PreprocessorControlDesc &I) { return I.IsDefined; }); } void TGLexer::prepReportPreprocessorStackError() { - if (PrepIncludeStack.back()->empty()) + if (PrepIncludeStack.back().empty()) PrintFatalError("prepReportPreprocessorStackError() called with " "empty control stack"); - auto &PrepControl = PrepIncludeStack.back()->back(); + auto &PrepControl = PrepIncludeStack.back().back(); PrintError(CurBuf.end(), "reached EOF without matching #endif"); PrintError(PrepControl.SrcPos, "the latest preprocessor control is here"); diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index 963d75e52cc8f..f8b32dc5377f5 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H #define LLVM_LIB_TABLEGEN_TGLEXER_H +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" #include "llvm/Support/DataTypes.h" @@ -21,7 +22,6 @@ #include #include #include -#include namespace llvm { template class 
ArrayRef; @@ -323,8 +323,7 @@ class TGLexer { // preprocessing control stacks for the current file and all its // parent files. The back() element is the preprocessing control // stack for the current file. - std::vector>> - PrepIncludeStack; + SmallVector> PrepIncludeStack; // Validate that the current preprocessing control stack is empty, // since we are about to exit a file, and pop the include stack. From f7420a9dff6d09715042b60c9e26a40a1b2a3147 Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Fri, 3 Jan 2025 19:41:48 +0000 Subject: [PATCH 117/480] [flang][debug] Fix issue with argument numbering. (#120726) Currently fir::isDummyArgument is being used to check if a DeclareOp represents a dummy argument. The argument passed to the function is declOp.getMemref(). This bypasses the code in isDummyArgument that checks for dummy_scope because the `Value` returned by getMemref() may not have DeclareOp as its defining op. This bypass means that sometimes a variable will be marked as an argument when it should not be. This happened in this case, where the same arg was being used for 2 different result variables with use of `entry` in the function. The solution is to check directly if the declOp has a dummy_scope. If yes, we know this is a dummy argument. We can now check if the memref points to the BlockArgument and use its number. This will still miss arguments where memref does not directly point to a BlockArgument but that is missed currently too. Note that we can still evaluate those variables in the debugger. It is just that they are not marked as arguments. Fixes #116525.
--- flang/lib/Optimizer/Transforms/AddDebugInfo.cpp | 6 +++--- flang/test/Integration/debug-116525.f90 | 12 ++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) create mode 100644 flang/test/Integration/debug-116525.f90 diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp index 3a437c7a0f013..a8e9d198ccb97 100644 --- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp +++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp @@ -121,9 +121,9 @@ void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp, // constant attribute of [hl]fir.declare/fircg.ext_declare operation that has // a dummy_scope operand). unsigned argNo = 0; - if (fir::isDummyArgument(declOp.getMemref())) { - auto arg = llvm::cast(declOp.getMemref()); - argNo = arg.getArgNumber() + 1; + if (declOp.getDummyScope()) { + if (auto arg = llvm::dyn_cast(declOp.getMemref())) + argNo = arg.getArgNumber() + 1; } auto tyAttr = typeGen.convertType(fir::unwrapRefType(declOp.getType()), diff --git a/flang/test/Integration/debug-116525.f90 b/flang/test/Integration/debug-116525.f90 new file mode 100644 index 0000000000000..1916a34df4c12 --- /dev/null +++ b/flang/test/Integration/debug-116525.f90 @@ -0,0 +1,12 @@ +! RUN: %flang_fc1 -fopenmp -emit-llvm -debug-info-kind=standalone %s -o - + +! Test that this does not cause build failure. 
+function s(x) + character(len=2) :: x, s, ss + + s = x + + entry ss() + +end function s + From 3b72c62e7faa918d9a7e7439a4aa86d00921b0b8 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 14:42:39 -0500 Subject: [PATCH 118/480] [AMDGPU][True16][MC] true16 for v_frexp_mant_f16 (#120653) Support true16 format for v_frexp_mant_f16 in MC --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/CodeGen/AMDGPU/llvm.frexp.ll | 262 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 75 +++-- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 65 +++-- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 21 +- llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s | 42 +++ .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s | 154 +++++++--- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 65 +++-- .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 25 +- .../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s | 69 +++-- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 72 +++-- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 62 +++-- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 18 +- llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s | 42 +++ .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s | 154 +++++++--- .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 69 +++-- .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 65 +++-- .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 25 +- .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 63 ++++- .../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 54 +++- .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 23 +- .../gfx11_dasm_vop3_dpp16_from_vop1.txt | 54 +++- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 24 +- .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt | 57 +++- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 50 +++- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 13 +- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 58 ++-- .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 55 +++- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 25 +- 29 files changed, 1298 insertions(+), 465 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td 
b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index badca264e8f92..79f0caec418ba 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1036,7 +1036,7 @@ defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16" defm V_LOG_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">; defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">; defm V_EXP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">; -defm V_FREXP_MANT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x059, "v_frexp_mant_f16">; +defm V_FREXP_MANT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x059, "v_frexp_mant_f16">; defm V_FREXP_EXP_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05a, "v_frexp_exp_i16_f16">; defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">; defm V_FLOOR_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll index b9fef0834cb24..88ef7a9363930 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll @@ -3,11 +3,13 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-GISEL %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s ; RUN: llc -global-isel=1 
-mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s define { half, i32 } @test_frexp_f16_i32(half %a) { ; GFX6-SDAG-LABEL: test_frexp_f16_i32: @@ -50,6 +52,19 @@ define { half, i32 } @test_frexp_f16_i32(half %a) { ; GFX11-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v1, v0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -96,6 +111,16 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) { ; GFX11-NEXT: v_frexp_mant_f16_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i32_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -145,6 +170,18 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) { ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; 
GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i32_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -221,6 +258,25 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) { ; GFX11-NEXT: v_bfe_i32 v2, v4, 0, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f16_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v2, v0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_frexp_mant_f16_e32 v3, v1 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v4, v1 +; GFX12-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_pack_b32_f16 v0, v2, v3 +; GFX12-NEXT: v_bfe_i32 v2, v4, 0, 16 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -311,6 +367,20 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) { ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f16_v2i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-NEXT: v_frexp_mant_f16_e32 v1, v1 +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -386,6 +456,22 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) { ; GFX11-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f16_v2i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v1, v1 +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -463,6 +549,19 @@ define { half, i16 } @test_frexp_f16_i16(half %a) { ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v2, v0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i16: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -509,6 +608,16 @@ define half 
@test_frexp_f16_i16_only_use_fract(half %a) { ; GFX11-NEXT: v_frexp_mant_f16_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i16_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i16_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -554,6 +663,16 @@ define i16 @test_frexp_f16_i16_only_use_exp(half %a) { ; GFX11-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i16_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i16_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -623,6 +742,19 @@ define { float, i32 } @test_frexp_f32_i32(float %a) { ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f32_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v2, v0 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f32_i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -665,6 +797,16 @@ define float @test_frexp_f32_i32_only_use_fract(float %a) { ; 
GFX11-NEXT: v_frexp_mant_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f32_i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f32_i32_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -706,6 +848,16 @@ define i32 @test_frexp_f32_i32_only_use_exp(float %a) { ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f32_i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f32_i32_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -771,6 +923,21 @@ define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) { ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f32_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v5, v1 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v3, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32: ; GFX6-GISEL: ; %bb.0: ; 
GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -846,6 +1013,17 @@ define <2 x float> @test_frexp_v2f32_v2i32_only_use_fract(<2 x float> %a) { ; GFX11-NEXT: v_frexp_mant_f32_e32 v1, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f32_v2i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v1, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -896,6 +1074,17 @@ define <2 x i32> @test_frexp_v2f32_v2i32_only_use_exp(<2 x float> %a) { ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f32_v2i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -954,6 +1143,19 @@ define { double, i32 } @test_frexp_f64_i32(double %a) { ; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f64_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f64_e32 v[3:4], v[0:1] +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v2, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; 
GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f64_i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1000,6 +1202,16 @@ define double @test_frexp_f64_i32_only_use_fract(double %a) { ; GFX11-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f64_i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1044,6 +1256,16 @@ define i32 @test_frexp_f64_i32_only_use_exp(double %a) { ; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f64_i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1116,6 +1338,22 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) { ; GFX11-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f64_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f64_e32 v[8:9], v[0:1] +; GFX12-NEXT: 
v_frexp_mant_f64_e32 v[6:7], v[2:3] +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v4, v[0:1] +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v5, v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 +; GFX12-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f64_v2i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1174,6 +1412,17 @@ define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) { ; GFX11-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] ; GFX11-NEXT: v_frexp_mant_f64_e32 v[2:3], v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_frexp_v2f64_v2i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] +; GFX12-NEXT: v_frexp_mant_f64_e32 v[2:3], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a) %result.0 = extractvalue { <2 x double>, <2 x i32> } %result, 0 ret <2 x double> %result.0 @@ -1213,6 +1462,17 @@ define <2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) { ; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1] ; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v1, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_frexp_v2f64_v2i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1] +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v1, v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x 
double> %a) %result.1 = extractvalue { <2 x double>, <2 x i32> } %result, 1 ret <2 x i32> %result.1 @@ -1235,3 +1495,5 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo ; GCN: {{.*}} ; GFX11-GISEL: {{.*}} ; GFX11-SDAG: {{.*}} +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index 9b9837b46b26d..b98955d268a72 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -2351,50 +2351,65 @@ v_frexp_exp_i32_f64 v5, src_scc v_frexp_exp_i32_f64 v255, 0xaf123456 // GFX11: v_frexp_exp_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_frexp_mant_f16 v5, v1 -// GFX11: v_frexp_mant_f16_e32 v5, v1 ; encoding: [0x01,0xb3,0x0a,0x7e] +v_frexp_mant_f16 v5.l, v1.l +// GFX11: v_frexp_mant_f16_e32 v5.l, v1.l ; encoding: [0x01,0xb3,0x0a,0x7e] -v_frexp_mant_f16 v5, v127 -// GFX11: v_frexp_mant_f16_e32 v5, v127 ; encoding: [0x7f,0xb3,0x0a,0x7e] +v_frexp_mant_f16 v5.l, v127.l +// GFX11: v_frexp_mant_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xb3,0x0a,0x7e] -v_frexp_mant_f16 v5, s1 -// GFX11: v_frexp_mant_f16_e32 v5, s1 ; encoding: [0x01,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, s1 +// GFX11: v_frexp_mant_f16_e32 v5.l, s1 ; encoding: [0x01,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, s105 -// GFX11: v_frexp_mant_f16_e32 v5, s105 ; encoding: [0x69,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, s105 +// GFX11: v_frexp_mant_f16_e32 v5.l, s105 ; encoding: [0x69,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, vcc_lo -// GFX11: v_frexp_mant_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, vcc_lo +// GFX11: v_frexp_mant_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, vcc_hi -// GFX11: v_frexp_mant_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, vcc_hi +// GFX11: v_frexp_mant_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, 
ttmp15 -// GFX11: v_frexp_mant_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, ttmp15 +// GFX11: v_frexp_mant_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, m0 -// GFX11: v_frexp_mant_f16_e32 v5, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, m0 +// GFX11: v_frexp_mant_f16_e32 v5.l, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, exec_lo -// GFX11: v_frexp_mant_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, exec_lo +// GFX11: v_frexp_mant_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, exec_hi -// GFX11: v_frexp_mant_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, exec_hi +// GFX11: v_frexp_mant_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, null -// GFX11: v_frexp_mant_f16_e32 v5, null ; encoding: [0x7c,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, null +// GFX11: v_frexp_mant_f16_e32 v5.l, null ; encoding: [0x7c,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, -1 -// GFX11: v_frexp_mant_f16_e32 v5, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, -1 +// GFX11: v_frexp_mant_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, 0.5 -// GFX11: v_frexp_mant_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, 0.5 +// GFX11: v_frexp_mant_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, src_scc -// GFX11: v_frexp_mant_f16_e32 v5, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, src_scc +// GFX11: v_frexp_mant_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v127, 0xfe0b -// GFX11: v_frexp_mant_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_frexp_mant_f16 v127.l, 0xfe0b +// GFX11: v_frexp_mant_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_frexp_mant_f16 v5.l, v1.h +// GFX11: v_frexp_mant_f16_e32 v5.l, 
v1.h ; encoding: [0x81,0xb3,0x0a,0x7e] + +v_frexp_mant_f16 v5.l, v127.h +// GFX11: v_frexp_mant_f16_e32 v5.l, v127.h ; encoding: [0xff,0xb3,0x0a,0x7e] + +v_frexp_mant_f16 v127.l, 0.5 +// GFX11: v_frexp_mant_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xb2,0xfe,0x7e] + +v_frexp_mant_f16 v5.h, src_scc +// GFX11: v_frexp_mant_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7f] + +v_frexp_mant_f16 v127.h, 0xfe0b +// GFX11: v_frexp_mant_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_frexp_mant_f32 v5, v1 // GFX11: v_frexp_mant_f32_e32 v5, v1 ; encoding: [0x01,0x81,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index b080bd9fca461..f46abd344d607 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ -1766,47 +1766,56 @@ v_frexp_exp_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 f v_frexp_exp_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_frexp_exp_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_frexp_mant_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_frexp_mant_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_frexp_mant_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_frexp_mant_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_frexp_mant_f16 v5, v1 row_mirror -// GFX11: 
v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_mirror +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_half_mirror -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_half_mirror +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_shl:1 -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_shl:1 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_shl:15 -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_shl:15 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_shr:1 -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_shr:1 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_shr:15 -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_shr:15 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_ror:1 -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_ror:1 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_ror:15 -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_ror:15 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_frexp_mant_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_frexp_mant_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_frexp_mant_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13] 
-v_frexp_mant_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_frexp_mant_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_frexp_mant_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_frexp_mant_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_frexp_mant_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +v_frexp_mant_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_frexp_mant_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_frexp_mant_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_frexp_mant_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_frexp_mant_f32 v5, v1 quad_perm:[3,2,1,0] // GFX11: v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index 6a47dce49ed2a..c5df74758d71e 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -416,14 +416,23 @@ v_frexp_exp_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_frexp_exp_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_frexp_exp_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_frexp_mant_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_frexp_mant_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_frexp_mant_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_frexp_mant_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_frexp_mant_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +v_frexp_mant_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_frexp_mant_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00] v_frexp_mant_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_frexp_mant_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index 34f10c98e1468..ca181f1e59db5 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -521,6 +521,12 @@ v_frexp_exp_i16_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] v_frexp_mant_f16_e32 v128, 
0xfe0b // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_frexp_mant_f16_e32 v128.h, 0xfe0b +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v128.l, 0xfe0b +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + v_frexp_mant_f16_e32 v255, v1 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -530,6 +536,24 @@ v_frexp_mant_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_frexp_mant_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:31: error: invalid operand for instruction +v_frexp_mant_f16_e32 v255.h, v1.h +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.l, v1.l +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction + v_frexp_mant_f16_e32 v5, v199 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -539,6 +563,24 @@ v_frexp_mant_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_frexp_mant_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:31: error: invalid operand for instruction +v_frexp_mant_f16_e32 v5.h, v199.h +// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.l, v199.l +// 
GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction + v_log_f16_e32 v128.h, 0xfe0b // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index 9e424fbd004e4..a0a07a03e14c3 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -1406,71 +1406,137 @@ v_frexp_exp_i16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] v_frexp_exp_i16_f16 v5.l, v199.l quad_perm:[3,2,1,0] // GFX11: v_frexp_exp_i16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xda,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_frexp_mant_f16 v128, 0xfe0b -// GFX11: v_frexp_mant_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_frexp_mant_f16 v128.h, 0xfe0b +// GFX11: v_frexp_mant_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_frexp_mant_f16 v255, -1 -// GFX11: v_frexp_mant_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +v_frexp_mant_f16 v128.l, 0xfe0b +// GFX11: v_frexp_mant_f16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_frexp_mant_f16 v255, 0.5 -// GFX11: v_frexp_mant_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, -1 +// GFX11: v_frexp_mant_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xc1,0x00,0x00,0x00] -v_frexp_mant_f16 v255, exec_hi -// GFX11: v_frexp_mant_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, 0.5 
+// GFX11: v_frexp_mant_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xf0,0x00,0x00,0x00] -v_frexp_mant_f16 v255, exec_lo -// GFX11: v_frexp_mant_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, exec_hi +// GFX11: v_frexp_mant_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7f,0x00,0x00,0x00] -v_frexp_mant_f16 v255, m0 -// GFX11: v_frexp_mant_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, exec_lo +// GFX11: v_frexp_mant_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7e,0x00,0x00,0x00] -v_frexp_mant_f16 v255, null -// GFX11: v_frexp_mant_f16_e64 v255, null ; encoding: [0xff,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, m0 +// GFX11: v_frexp_mant_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7d,0x00,0x00,0x00] -v_frexp_mant_f16 v255, s1 -// GFX11: v_frexp_mant_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, null +// GFX11: v_frexp_mant_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7c,0x00,0x00,0x00] -v_frexp_mant_f16 v255, s105 -// GFX11: v_frexp_mant_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, s1 +// GFX11: v_frexp_mant_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x01,0x00,0x00,0x00] -v_frexp_mant_f16 v255, src_scc -// GFX11: v_frexp_mant_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, s105 +// GFX11: v_frexp_mant_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x69,0x00,0x00,0x00] -v_frexp_mant_f16 v255, ttmp15 -// GFX11: v_frexp_mant_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, src_scc +// GFX11: v_frexp_mant_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: 
[0xff,0x40,0xd9,0xd5,0xfd,0x00,0x00,0x00] -v_frexp_mant_f16 v255, v1 -// GFX11: v_frexp_mant_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +v_frexp_mant_f16 v255.h, ttmp15 +// GFX11: v_frexp_mant_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7b,0x00,0x00,0x00] -v_frexp_mant_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_frexp_mant_f16 v255.h, v1.h +// GFX11: v_frexp_mant_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00] -v_frexp_mant_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_frexp_mant_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_frexp_mant_f16 v255, v127 -// GFX11: v_frexp_mant_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x01,0x00,0x00] +v_frexp_mant_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_frexp_mant_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_frexp_mant_f16 v255.h, v127.h +// GFX11: v_frexp_mant_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xd9,0xd5,0x7f,0x01,0x00,0x00] -v_frexp_mant_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] 
+v_frexp_mant_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_frexp_mant_f16 v255, vcc_hi -// GFX11: v_frexp_mant_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_frexp_mant_f16 v255, vcc_lo -// GFX11: v_frexp_mant_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, vcc_hi +// GFX11: v_frexp_mant_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x6b,0x00,0x00,0x00] -v_frexp_mant_f16 v5, v199 -// GFX11: v_frexp_mant_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xd9,0xd5,0xc7,0x01,0x00,0x00] +v_frexp_mant_f16 v255.h, vcc_lo +// GFX11: v_frexp_mant_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x6a,0x00,0x00,0x00] -v_frexp_mant_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_frexp_mant_f16 v255.l, -1 +// GFX11: v_frexp_mant_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] -v_frexp_mant_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_frexp_mant_f16 v255.l, 0.5 +// GFX11: v_frexp_mant_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, exec_hi +// GFX11: v_frexp_mant_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, exec_lo +// GFX11: v_frexp_mant_f16_e64 
v255.l, exec_lo ; encoding: [0xff,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, m0 +// GFX11: v_frexp_mant_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, null +// GFX11: v_frexp_mant_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, s1 +// GFX11: v_frexp_mant_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, s105 +// GFX11: v_frexp_mant_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, src_scc +// GFX11: v_frexp_mant_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, ttmp15 +// GFX11: v_frexp_mant_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, v1.l +// GFX11: v_frexp_mant_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] + +v_frexp_mant_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_frexp_mant_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_frexp_mant_f16 v255.l, v127.l +// GFX11: v_frexp_mant_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x01,0x00,0x00] + +v_frexp_mant_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_frexp_mant_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + 
+v_frexp_mant_f16 v255.l, vcc_hi +// GFX11: v_frexp_mant_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, vcc_lo +// GFX11: v_frexp_mant_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] + +v_frexp_mant_f16 v5.h, v199.h +// GFX11: v_frexp_mant_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0xc7,0x01,0x00,0x00] + +v_frexp_mant_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_frexp_mant_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_frexp_mant_f16 v5.l, v199.l +// GFX11: v_frexp_mant_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xd9,0xd5,0xc7,0x01,0x00,0x00] + +v_frexp_mant_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_frexp_mant_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_log_f16 v128, 0xfe0b // GFX11: v_log_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xd7,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index 3992b869c46d5..1a7eb2c23a7d2 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -1861,47 +1861,56 @@ v_frexp_exp_i32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 
row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30] -v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_mirror -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 -// GFX11: 
v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 
row_ror:15 -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: 
v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_frexp_mant_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_frexp_mant_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index a123c73c73bcb..73c21ce24d15c 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -538,17 +538,26 @@ v_frexp_exp_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0xbf,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00] -v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 
+v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_frexp_mant_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_frexp_mant_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x08,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: 
[0xff,0xc1,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 4b055165871cf..860c0f4eca7b3 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -2347,50 +2347,59 @@ v_frexp_exp_i32_f64_e64 v5, -|src_scc| v_frexp_exp_i32_f64_e64 v255, 0xaf123456 // GFX11: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] -v_frexp_mant_f16_e64 v5, v1 -// GFX11: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, v1.l +// GFX11: v_frexp_mant_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] -v_frexp_mant_f16_e64 v5, v255 -// GFX11: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, v255.l +// GFX11: v_frexp_mant_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] -v_frexp_mant_f16_e64 v5, s1 -// GFX11: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, s1 +// GFX11: v_frexp_mant_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, s105 -// GFX11: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, s105 +// GFX11: v_frexp_mant_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, vcc_lo -// GFX11: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, vcc_lo +// GFX11: v_frexp_mant_f16_e64 v5.l, vcc_lo ; encoding: 
[0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, vcc_hi -// GFX11: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, vcc_hi +// GFX11: v_frexp_mant_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, ttmp15 -// GFX11: v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, ttmp15 +// GFX11: v_frexp_mant_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, m0 -// GFX11: v_frexp_mant_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, m0 +// GFX11: v_frexp_mant_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, exec_lo -// GFX11: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, exec_lo +// GFX11: v_frexp_mant_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, exec_hi -// GFX11: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, exec_hi +// GFX11: v_frexp_mant_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, null -// GFX11: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, null +// GFX11: v_frexp_mant_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, -1 -// GFX11: v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, -1 +// GFX11: v_frexp_mant_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, 0.5 mul:2 -// GFX11: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] 
+v_frexp_mant_f16_e64 v5.l, 0.5 mul:2 +// GFX11: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] -v_frexp_mant_f16_e64 v5, src_scc mul:4 -// GFX11: v_frexp_mant_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] +v_frexp_mant_f16_e64 v5.l, src_scc mul:4 +// GFX11: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] -v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX11: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX11: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_frexp_mant_f16_e64 v5.h, v1.h +// GFX11: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00] + +v_frexp_mant_f16_e64 v5.l, v255.h +// GFX11: [0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00] + +v_frexp_mant_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX11: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_frexp_mant_f32_e64 v5, v1 // GFX11: v_frexp_mant_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index ed90e480012c0..0195c34a552e3 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -2432,50 +2432,62 @@ v_frexp_exp_i32_f64 v5, src_scc v_frexp_exp_i32_f64 v255, 0xaf123456 // GFX12: v_frexp_exp_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_frexp_mant_f16 v5, v1 -// GFX12: v_frexp_mant_f16_e32 v5, v1 ; encoding: [0x01,0xb3,0x0a,0x7e] +v_frexp_mant_f16 v5.l, v1.l +// GFX12: v_frexp_mant_f16_e32 v5.l, v1.l ; encoding: [0x01,0xb3,0x0a,0x7e] -v_frexp_mant_f16 v5, v127 -// GFX12: v_frexp_mant_f16_e32 v5, v127 ; encoding: [0x7f,0xb3,0x0a,0x7e] +v_frexp_mant_f16 v5.l, v127.l +// GFX12: 
v_frexp_mant_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xb3,0x0a,0x7e] -v_frexp_mant_f16 v5, s1 -// GFX12: v_frexp_mant_f16_e32 v5, s1 ; encoding: [0x01,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, s1 +// GFX12: v_frexp_mant_f16_e32 v5.l, s1 ; encoding: [0x01,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, s105 -// GFX12: v_frexp_mant_f16_e32 v5, s105 ; encoding: [0x69,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, s105 +// GFX12: v_frexp_mant_f16_e32 v5.l, s105 ; encoding: [0x69,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, vcc_lo -// GFX12: v_frexp_mant_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, vcc_lo +// GFX12: v_frexp_mant_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, vcc_hi -// GFX12: v_frexp_mant_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, vcc_hi +// GFX12: v_frexp_mant_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, ttmp15 -// GFX12: v_frexp_mant_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, ttmp15 +// GFX12: v_frexp_mant_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, m0 -// GFX12: v_frexp_mant_f16_e32 v5, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, m0 +// GFX12: v_frexp_mant_f16_e32 v5.l, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, exec_lo -// GFX12: v_frexp_mant_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, exec_lo +// GFX12: v_frexp_mant_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, exec_hi -// GFX12: v_frexp_mant_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, exec_hi +// GFX12: v_frexp_mant_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, null -// GFX12: v_frexp_mant_f16_e32 v5, null ; encoding: [0x7c,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, null +// GFX12: v_frexp_mant_f16_e32 v5.l, null ; encoding: [0x7c,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, -1 
-// GFX12: v_frexp_mant_f16_e32 v5, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, -1 +// GFX12: v_frexp_mant_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, 0.5 -// GFX12: v_frexp_mant_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, 0.5 +// GFX12: v_frexp_mant_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v5, src_scc -// GFX12: v_frexp_mant_f16_e32 v5, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e] +v_frexp_mant_f16 v5.l, src_scc +// GFX12: v_frexp_mant_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v127, 0xfe0b -// GFX12: v_frexp_mant_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_frexp_mant_f16 v127.l, 0xfe0b +// GFX12: v_frexp_mant_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_frexp_mant_f16 v5.l, v1.h +// GFX12: v_frexp_mant_f16_e32 v5.l, v1.h ; encoding: [0x81,0xb3,0x0a,0x7e] + +v_frexp_mant_f16 v5.l, v127.h +// GFX12: v_frexp_mant_f16_e32 v5.l, v127.h ; encoding: [0xff,0xb3,0x0a,0x7e] + +v_frexp_mant_f16 v5.h, src_scc +// GFX12: v_frexp_mant_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7f] + +v_frexp_mant_f16 v127.h, 0xfe0b +// GFX12: v_frexp_mant_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_frexp_mant_f32 v5, v1 // GFX12: v_frexp_mant_f32_e32 v5, v1 ; encoding: [0x01,0x81,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index 90968055e2a82..072544e66e4a5 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -1834,47 +1834,53 @@ v_frexp_exp_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 f v_frexp_exp_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_frexp_exp_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_frexp_mant_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_frexp_mant_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_frexp_mant_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_frexp_mant_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_frexp_mant_f16 v5, v1 row_mirror -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_mirror +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_half_mirror -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_half_mirror +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_shl:1 -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_shl:1 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_shl:15 -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_shl:15 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_shr:1 -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_shr:1 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_shr:15 -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_shr:15 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_ror:1 -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_ror:1 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_ror:15 -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_ror:15 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_frexp_mant_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_frexp_mant_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_frexp_mant_f16 v5, 
v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_frexp_mant_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_frexp_mant_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_frexp_mant_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_frexp_mant_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_frexp_mant_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_frexp_mant_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_frexp_mant_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_frexp_mant_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_frexp_mant_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_frexp_mant_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_frexp_mant_f32 v5, v1 quad_perm:[3,2,1,0] // GFX12: v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git 
a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index 0ce0087918f56..bc3559e3c65ed 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -448,14 +448,20 @@ v_frexp_exp_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_frexp_exp_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_frexp_exp_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_frexp_mant_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_frexp_mant_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_frexp_mant_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_frexp_mant_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_frexp_mant_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_frexp_mant_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00] v_frexp_mant_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_frexp_mant_f32_dpp v5, v1 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s index 92a0d15bbc6f0..0d759baf0af0d 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s @@ -494,6 +494,12 @@ v_frexp_exp_i16_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] v_frexp_mant_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_frexp_mant_f16_e32 v128.h, 0xfe0b +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v128.l, 0xfe0b +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + v_frexp_mant_f16_e32 v255, v1 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -503,6 +509,24 @@ v_frexp_mant_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_frexp_mant_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:31: error: invalid operand for instruction +v_frexp_mant_f16_e32 v255.h, v1.h +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.l, v1.l +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction + v_frexp_mant_f16_e32 v5, v199 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -512,6 +536,24 @@ v_frexp_mant_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_frexp_mant_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:31: error: 
invalid operand for instruction +v_frexp_mant_f16_e32 v5.h, v199.h +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.l, v199.l +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_frexp_mant_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction + v_log_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s index bbe7b65d03281..976b6bb69c33e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s @@ -1366,71 +1366,137 @@ v_frexp_exp_i16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] v_frexp_exp_i16_f16 v5.l, v199.l quad_perm:[3,2,1,0] // GFX12: v_frexp_exp_i16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xda,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_frexp_mant_f16 v128, 0xfe0b -// GFX12: v_frexp_mant_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_frexp_mant_f16 v128.h, 0xfe0b +// GFX12: v_frexp_mant_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_frexp_mant_f16 v255, -1 -// GFX12: v_frexp_mant_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +v_frexp_mant_f16 v128.l, 0xfe0b +// GFX12: v_frexp_mant_f16_e64 v128.l, 0xfe0b ; encoding: 
[0x80,0x00,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_frexp_mant_f16 v255, 0.5 -// GFX12: v_frexp_mant_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, -1 +// GFX12: v_frexp_mant_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xc1,0x00,0x00,0x00] -v_frexp_mant_f16 v255, exec_hi -// GFX12: v_frexp_mant_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, 0.5 +// GFX12: v_frexp_mant_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xf0,0x00,0x00,0x00] -v_frexp_mant_f16 v255, exec_lo -// GFX12: v_frexp_mant_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, exec_hi +// GFX12: v_frexp_mant_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7f,0x00,0x00,0x00] -v_frexp_mant_f16 v255, m0 -// GFX12: v_frexp_mant_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, exec_lo +// GFX12: v_frexp_mant_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7e,0x00,0x00,0x00] -v_frexp_mant_f16 v255, null -// GFX12: v_frexp_mant_f16_e64 v255, null ; encoding: [0xff,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, m0 +// GFX12: v_frexp_mant_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7d,0x00,0x00,0x00] -v_frexp_mant_f16 v255, s1 -// GFX12: v_frexp_mant_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, null +// GFX12: v_frexp_mant_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7c,0x00,0x00,0x00] -v_frexp_mant_f16 v255, s105 -// GFX12: v_frexp_mant_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, s1 +// GFX12: v_frexp_mant_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x01,0x00,0x00,0x00] -v_frexp_mant_f16 v255, src_scc -// GFX12: 
v_frexp_mant_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, s105 +// GFX12: v_frexp_mant_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x69,0x00,0x00,0x00] -v_frexp_mant_f16 v255, ttmp15 -// GFX12: v_frexp_mant_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, src_scc +// GFX12: v_frexp_mant_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xfd,0x00,0x00,0x00] -v_frexp_mant_f16 v255, v1 -// GFX12: v_frexp_mant_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +v_frexp_mant_f16 v255.h, ttmp15 +// GFX12: v_frexp_mant_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7b,0x00,0x00,0x00] -v_frexp_mant_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_frexp_mant_f16 v255.h, v1.h +// GFX12: v_frexp_mant_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00] -v_frexp_mant_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_frexp_mant_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_frexp_mant_f16 v255, v127 -// GFX12: v_frexp_mant_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x01,0x00,0x00] +v_frexp_mant_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_frexp_mant_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v255, v127 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_frexp_mant_f16 v255.h, v127.h +// GFX12: v_frexp_mant_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xd9,0xd5,0x7f,0x01,0x00,0x00] -v_frexp_mant_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_frexp_mant_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_frexp_mant_f16 v255, vcc_hi -// GFX12: v_frexp_mant_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_frexp_mant_f16 v255, vcc_lo -// GFX12: v_frexp_mant_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +v_frexp_mant_f16 v255.h, vcc_hi +// GFX12: v_frexp_mant_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x6b,0x00,0x00,0x00] -v_frexp_mant_f16 v5, v199 -// GFX12: v_frexp_mant_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xd9,0xd5,0xc7,0x01,0x00,0x00] +v_frexp_mant_f16 v255.h, vcc_lo +// GFX12: v_frexp_mant_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x6a,0x00,0x00,0x00] -v_frexp_mant_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_frexp_mant_f16 v255.l, -1 +// GFX12: v_frexp_mant_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] -v_frexp_mant_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v5, 
v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_frexp_mant_f16 v255.l, 0.5 +// GFX12: v_frexp_mant_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, exec_hi +// GFX12: v_frexp_mant_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, exec_lo +// GFX12: v_frexp_mant_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, m0 +// GFX12: v_frexp_mant_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, null +// GFX12: v_frexp_mant_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, s1 +// GFX12: v_frexp_mant_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, s105 +// GFX12: v_frexp_mant_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, src_scc +// GFX12: v_frexp_mant_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, ttmp15 +// GFX12: v_frexp_mant_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, v1.l +// GFX12: v_frexp_mant_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] + +v_frexp_mant_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_frexp_mant_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_frexp_mant_f16 v255.l, v127.l +// GFX12: v_frexp_mant_f16_e64 v255.l, v127.l ; encoding: 
[0xff,0x00,0xd9,0xd5,0x7f,0x01,0x00,0x00] + +v_frexp_mant_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_frexp_mant_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_frexp_mant_f16 v255.l, vcc_hi +// GFX12: v_frexp_mant_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] + +v_frexp_mant_f16 v255.l, vcc_lo +// GFX12: v_frexp_mant_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] + +v_frexp_mant_f16 v5.h, v199.h +// GFX12: v_frexp_mant_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0xc7,0x01,0x00,0x00] + +v_frexp_mant_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_frexp_mant_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_frexp_mant_f16 v5.l, v199.l +// GFX12: v_frexp_mant_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xd9,0xd5,0xc7,0x01,0x00,0x00] + +v_frexp_mant_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_frexp_mant_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_log_f16 v128, 0xfe0b // GFX12: v_log_f16_e64 v128, 0xfe0b ; encoding: 
[0x80,0x00,0xd7,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index 5af15f2eb971f..e4f62eadc0e49 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -2497,50 +2497,59 @@ v_frexp_exp_i32_f64_e64 v5, -|src_scc| v_frexp_exp_i32_f64_e64 v255, 0xaf123456 // GFX12: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] -v_frexp_mant_f16_e64 v5, v1 -// GFX12: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, v1.l +// GFX12: v_frexp_mant_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] -v_frexp_mant_f16_e64 v5, v255 -// GFX12: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, v255.l +// GFX12: v_frexp_mant_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] -v_frexp_mant_f16_e64 v5, s1 -// GFX12: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, s1 +// GFX12: v_frexp_mant_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, s105 -// GFX12: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, s105 +// GFX12: v_frexp_mant_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, vcc_lo -// GFX12: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, vcc_lo +// GFX12: v_frexp_mant_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, vcc_hi -// GFX12: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, vcc_hi +// 
GFX12: v_frexp_mant_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, ttmp15 -// GFX12: v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, ttmp15 +// GFX12: v_frexp_mant_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, m0 -// GFX12: v_frexp_mant_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, m0 +// GFX12: v_frexp_mant_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, exec_lo -// GFX12: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, exec_lo +// GFX12: v_frexp_mant_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, exec_hi -// GFX12: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, exec_hi +// GFX12: v_frexp_mant_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, null -// GFX12: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, null +// GFX12: v_frexp_mant_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, -1 -// GFX12: v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, -1 +// GFX12: v_frexp_mant_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, 0.5 mul:2 -// GFX12: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] +v_frexp_mant_f16_e64 v5.l, 0.5 mul:2 +// GFX12: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] -v_frexp_mant_f16_e64 v5, src_scc mul:4 -// GFX12: v_frexp_mant_f16_e64 v5, 
src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] +v_frexp_mant_f16_e64 v5.l, src_scc mul:4 +// GFX12: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] -v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX12: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX12: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_frexp_mant_f16_e64 v5.h, v1.h +// GFX12: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00] + +v_frexp_mant_f16_e64 v5.l, v255.h +// GFX12: v_frexp_mant_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00] + +v_frexp_mant_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX12: v_frexp_mant_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_frexp_mant_f32_e64 v5, v1 // GFX12: v_frexp_mant_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 39638cefd44ad..fb57e5cd54ab8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -1876,47 +1876,56 @@ v_frexp_exp_i32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30] -v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_mirror -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: 
v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_frexp_mant_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_frexp_mant_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index a6cef6f134b0a..acb73d8dbaf73 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -553,17 +553,26 @@ v_frexp_exp_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0xbf,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00] -v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_frexp_mant_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_frexp_mant_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 
dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt index 0abced9f2f77b..55b2081c04917 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt @@ -2290,49 +2290,82 @@ # GFX11: v_frexp_exp_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf] 0x01,0xb3,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, v1 ; encoding: [0x01,0xb3,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, v1.l ; encoding: [0x01,0xb3,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, v1 ; encoding: [0x01,0xb3,0x0a,0x7e] 0x7f,0xb3,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, v127 ; encoding: [0x7f,0xb3,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xb3,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, v127 ; encoding: [0x7f,0xb3,0x0a,0x7e] 0x01,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, s1 ; encoding: [0x01,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, s1 ; encoding: [0x01,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, s1 ; encoding: [0x01,0xb2,0x0a,0x7e] 0x69,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, s105 ; encoding: [0x69,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, s105 ; encoding: [0x69,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, s105 ; encoding: [0x69,0xb2,0x0a,0x7e] 
0x6a,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e] 0x6b,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e] 0x7b,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e] 0x7d,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e] 0x7e,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e] 0x7f,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e] 0x7c,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, null ; encoding: [0x7c,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, null ; encoding: [0x7c,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, null ; encoding: [0x7c,0xb2,0x0a,0x7e] 0xc1,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, -1 ; encoding: 
[0xc1,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e] 0xf0,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e] 0xfd,0xb2,0x0a,0x7e -# GFX11: v_frexp_mant_f16_e32 v5, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e] +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e] 0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00 -# GFX11: v_frexp_mant_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +0x81,0xb3,0x0a,0x7e +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, v1.h ; encoding: [0x81,0xb3,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xb3,0x0a,0x7e] + +0xff,0xb3,0x0a,0x7e +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, v127.h ; encoding: [0xff,0xb3,0x0a,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xb3,0x0a,0x7e] + +0xf0,0xb2,0xfe,0x7e +# GFX11-REAL16: v_frexp_mant_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xb2,0xfe,0x7e] +# GFX11-FAKE16: v_frexp_mant_f16_e32 v127, 0.5 ; encoding: [0xf0,0xb2,0xfe,0x7e] + +0xfd,0xb2,0x0a,0x7f +# GFX11-REAL16: v_frexp_mant_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7f] + +0xff,0xb2,0xfe,0x7f,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_frexp_mant_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] 0x01,0x81,0x0a,0x7e # GFX11: v_frexp_mant_f32_e32 v5, v1 ; encoding: 
[0x01,0x81,0x0a,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index 7043f3b2b9f29..d2e1e926cc19e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -1727,46 +1727,72 @@ # GFX11: v_frexp_exp_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 
row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] 
0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX11: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX11: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-REAL16: v_frexp_mant_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xb2,0xfe,0x7e,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX11-REAL16: v_frexp_mant_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX11: v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index d2eb919849fd3..93fb5e2b4c01a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -335,14 +335,33 @@ # GFX11: v_frexp_exp_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16-REAL16: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX11: v_frexp_mant_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16-REAL16: v_frexp_mant_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-REAL16-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05] +0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_frexp_mant_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +# GFX11-REAL16-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05] +0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187 ; encoding: [0x81,0x77,0x39,0x05] + +# GFX11-REAL16-REAL16: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00] +0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00] + 0xea,0x80,0xfe,0x7f,0xff,0x00,0x00,0x00 # GFX11: v_frexp_mant_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x80,0xfe,0x7f,0xff,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index 5c3fde7b80556..74d875081d113 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -1853,46 +1853,72 @@ # GFX11: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# 
GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX11: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX11: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index 28b39f4b0344a..a4bdfe9f4a975 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -505,16 +505,32 @@ # GFX11: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX11: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index d078bc2b8cb04..7c4f1634026fd 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -2316,49 +2316,76 @@ # GFX11: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, v1 ; encoding: 
[0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-FAKE16: 
v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08 -# GFX11: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2 ; encoding: 
[0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10 -# GFX11: v_frexp_mant_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX11: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00 +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00 +# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_frexp_mant_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00 # GFX11: v_frexp_mant_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00] 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index 46dedd970a320..24dc882e8beb0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -1839,46 +1839,68 @@ # GFX12: v_frexp_exp_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01] 
0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX12: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_frexp_mant_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX12-REAL16: v_frexp_mant_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX12: v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 551dab7ec3e7c..2eeb220b913fd 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ 
b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -348,10 +348,19 @@ # GFX12: v_frexp_exp_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_frexp_mant_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 0d01be721e60d..661d072f46c1a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -2368,50 +2368,76 @@ # GFX12: 
v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_hi ; 
encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +# 
GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_frexp_mant_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] - +# GFX12-REAL16: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00 +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00 +# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] +0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: 
v_frexp_mant_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +# GFX11: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_frexp_mant_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index d501d62c006eb..a1e431bc49d34 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -1859,47 +1859,72 @@ # GFX12: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 
row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] - +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +# GFX11: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index aba7d3ff43d8b..405b716c110e1 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -523,17 +523,32 @@ # GFX12: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00] 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +0x05,0x08,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +# GFX11: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From 9d8e634e85ca46fbec07733d3e69d34c0d7814ac Mon Sep 17 00:00:00 2001 From: Jeff Niu Date: Fri, 3 Jan 2025 11:44:46 -0800 Subject: [PATCH 119/480] [mlir][scf] Always remove for iter args that are loop invariant (#121555) This alters the condition in ForOpIterArgsFolder to always remove iter args when their initial value equals the yielded value, not just when the arg has no use. --- mlir/lib/Dialect/SCF/IR/SCF.cpp | 31 +++++++++++-------------- mlir/test/Dialect/SCF/canonicalize.mlir | 22 ++++++++++++++---- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index eded1c394f126..83ae79ce48266 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -839,8 +839,7 @@ mlir::scf::replaceAndCastForOpIterArg(RewriterBase &rewriter, scf::ForOp forOp, namespace { // Fold away ForOp iter arguments when: // 1) The op yields the iter arguments. -// 2) The iter arguments have no use and the corresponding outer region -// iterators (inputs) are yielded. +// 2) The argument's corresponding outer region iterators (inputs) are yielded. 
// 3) The iter arguments have no use and the corresponding (operation) results // have no use. // @@ -872,30 +871,28 @@ struct ForOpIterArgsFolder : public OpRewritePattern { newIterArgs.reserve(forOp.getInitArgs().size()); newYieldValues.reserve(numResults); newResultValues.reserve(numResults); - for (auto it : llvm::zip(forOp.getInitArgs(), // iter from outside - forOp.getRegionIterArgs(), // iter inside region - forOp.getResults(), // op results - forOp.getYieldedValues() // iter yield - )) { + for (auto [init, arg, result, yielded] : + llvm::zip(forOp.getInitArgs(), // iter from outside + forOp.getRegionIterArgs(), // iter inside region + forOp.getResults(), // op results + forOp.getYieldedValues() // iter yield + )) { // Forwarded is `true` when: // 1) The region `iter` argument is yielded. - // 2) The region `iter` argument has no use, and the corresponding iter - // operand (input) is yielded. + // 2) The region `iter` argument's corresponding input is yielded. // 3) The region `iter` argument has no use, and the corresponding op // result has no use. 
- bool forwarded = ((std::get<1>(it) == std::get<3>(it)) || - (std::get<1>(it).use_empty() && - (std::get<0>(it) == std::get<3>(it) || - std::get<2>(it).use_empty()))); + bool forwarded = (arg == yielded) || (init == yielded) || + (arg.use_empty() && result.use_empty()); keepMask.push_back(!forwarded); canonicalize |= forwarded; if (forwarded) { - newBlockTransferArgs.push_back(std::get<0>(it)); - newResultValues.push_back(std::get<0>(it)); + newBlockTransferArgs.push_back(init); + newResultValues.push_back(init); continue; } - newIterArgs.push_back(std::get<0>(it)); - newYieldValues.push_back(std::get<3>(it)); + newIterArgs.push_back(init); + newYieldValues.push_back(yielded); newBlockTransferArgs.push_back(Value()); // placeholder with null value newResultValues.push_back(Value()); // placeholder with null value } diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir index 8c4e7a41ee6bc..828758df6d31c 100644 --- a/mlir/test/Dialect/SCF/canonicalize.mlir +++ b/mlir/test/Dialect/SCF/canonicalize.mlir @@ -408,6 +408,20 @@ func.func @for_yields_4() -> i32 { // ----- +// CHECK-LABEL: @constant_iter_arg +func.func @constant_iter_arg(%arg0: index, %arg1: index, %arg2: index) { + %c0_i32 = arith.constant 0 : i32 + // CHECK: scf.for %arg3 = %arg0 to %arg1 step %arg2 { + %0 = scf.for %i = %arg0 to %arg1 step %arg2 iter_args(%arg3 = %c0_i32) -> i32 { + // CHECK-NEXT: "test.use"(%c0_i32) + "test.use"(%arg3) : (i32) -> () + scf.yield %c0_i32 : i32 + } + return +} + +// ----- + // CHECK-LABEL: @replace_true_if func.func @replace_true_if() { %true = arith.constant true @@ -1789,7 +1803,7 @@ module { } // CHECK-LABEL: @fold_iter_args_not_being_modified_within_scfforall // CHECK-SAME: (%{{.*}}: index, %[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor) -> (tensor, tensor) { -// CHECK: %[[RESULT:.*]] = scf.forall +// CHECK: %[[RESULT:.*]] = scf.forall // CHECK-SAME: shared_outs(%[[ITER_ARG_5:.*]] = %[[ARG2]]) -> (tensor) { // CHECK: 
%[[OPERAND0:.*]] = tensor.extract_slice %[[ARG1]] // CHECK: %[[OPERAND1:.*]] = tensor.extract_slice %[[ITER_ARG_5]] @@ -1832,7 +1846,7 @@ module { } // CHECK-LABEL: @fold_iter_args_with_no_use_of_result_scfforall // CHECK-SAME: (%{{.*}}: index, %[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor, %[[ARG3:.*]]: tensor) -> tensor { -// CHECK: %[[RESULT:.*]] = scf.forall +// CHECK: %[[RESULT:.*]] = scf.forall // CHECK-SAME: shared_outs(%[[ITER_ARG_6:.*]] = %[[ARG2]]) -> (tensor) { // CHECK: %[[OPERAND0:.*]] = tensor.extract_slice %[[ARG1]] // CHECK: %[[OPERAND1:.*]] = tensor.extract_slice %[[ARG3]] @@ -1856,7 +1870,7 @@ func.func @index_switch_fold() -> (f32, f32) { %y = arith.constant 42.0 : f32 scf.yield %y : f32 } - + %switch_cst_2 = arith.constant 2: index %1 = scf.index_switch %switch_cst_2 -> f32 case 0 { @@ -1867,7 +1881,7 @@ func.func @index_switch_fold() -> (f32, f32) { %y = arith.constant 42.0 : f32 scf.yield %y : f32 } - + return %0, %1 : f32, f32 } From b2adeae8650fb720873ad7fa39153beaa8194afc Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Fri, 3 Jan 2025 11:49:51 -0800 Subject: [PATCH 120/480] [AMDGPU][MC] Allow null where 128b or larger dst reg is expected (#115200) For GFX10+, currently null cannot be used as dst reg in instructions that expect the dst reg to be 128b or larger (e.g., s_load_dwordx4). This patch fixes this problem while ensuring null cannot be used as S#, T#, or V#. 
--- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 8 +- llvm/lib/Target/AMDGPU/BUFInstructions.td | 6 +- .../Disassembler/AMDGPUDisassembler.cpp | 25 ++ .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 1 + llvm/lib/Target/AMDGPU/MIMGInstructions.td | 56 ++--- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 19 +- llvm/lib/Target/AMDGPU/SMInstructions.td | 72 +++--- llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll | 8 +- llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s | 127 ++++++++++ llvm/test/MC/AMDGPU/gfx10_asm_mtbuf_err.s | 49 ++++ llvm/test/MC/AMDGPU/gfx10_asm_mubuf_err.s | 160 ++++++++++++ llvm/test/MC/AMDGPU/gfx10_asm_smem.s | 16 ++ llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s | 86 +++++++ llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s | 117 +++++++++ llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_err.s | 49 ++++ llvm/test/MC/AMDGPU/gfx11_asm_mubuf_err.s | 229 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx11_asm_smem.s | 16 ++ llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s | 31 +++ llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s | 119 +++++++++ llvm/test/MC/AMDGPU/gfx12_asm_smem.s | 19 ++ llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s | 31 +++ .../MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_err.s | 49 ++++ .../MC/AMDGPU/gfx12_asm_vbuffer_mubuf_err.s | 220 +++++++++++++++++ .../MC/Disassembler/AMDGPU/gfx10_smem.txt | 15 ++ .../Disassembler/AMDGPU/gfx11_dasm_smem.txt | 15 ++ .../Disassembler/AMDGPU/gfx12_dasm_smem.txt | 18 ++ 26 files changed, 1485 insertions(+), 76 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/gfx10_asm_mtbuf_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx10_asm_mubuf_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx11_asm_mubuf_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_err.s create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_err.s diff 
--git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index ed956a1f755c0..d8f441d1ccfe4 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -9760,10 +9760,14 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, case MCK_SReg_64: case MCK_SReg_64_XEXEC: // Null is defined as a 32-bit register but - // it should also be enabled with 64-bit operands. - // The following code enables it for SReg_64 operands + // it should also be enabled with 64-bit operands or larger. + // The following code enables it for SReg_64 and larger operands // used as source and destination. Remaining source // operands are handled in isInlinableImm. + case MCK_SReg_96: + case MCK_SReg_128: + case MCK_SReg_256: + case MCK_SReg_512: return Operand.isNull() ? Match_Success : Match_InvalidOperand; default: return Match_InvalidOperand; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index a351f451584f9..88205ea361c55 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -168,7 +168,7 @@ class getMTBUFInsDA vdataList, dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); - dag NonVaddrInputs = !con((ins SReg_128:$srsrc), SOffset, + dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, FORMAT:$format, CPol_0:$cpol, i1imm_0:$swz)); dag Inputs = !if(!empty(vaddrList), @@ -418,7 +418,7 @@ class getMUBUFInsDA vdataList, RegisterOperand vdata_op = getLdStVDataRegisterOperand.ret; dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); - dag NonVaddrInputs = !con((ins SReg_128:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz)); + dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, 
CPol_0:$cpol, i1imm_0:$swz)); dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddrClass:$vaddr), NonVaddrInputs)); dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs)); @@ -703,7 +703,7 @@ class getMUBUFAtomicInsDA : MIMG_gfx6789 { - let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, + let InOperandList = !con((ins addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -435,7 +435,7 @@ class MIMG_NoSampler_Helper_gfx90a : MIMG_gfx90a .ret:$vdata), dns> { - let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, + let InOperandList = !con((ins addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -447,7 +447,7 @@ class MIMG_NoSampler_gfx10 : MIMG_gfx10 { - let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -460,7 +460,7 @@ class MIMG_NoSampler_nsa_gfx10 : MIMG_nsa_gfx10 { let InOperandList = !con(AddrIns, - (ins SReg_256:$srsrc, DMask:$dmask, + (ins SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -472,7 +472,7 @@ class MIMG_NoSampler_gfx11 : MIMG_gfx11 { - let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -485,7 +485,7 @@ class MIMG_NoSampler_nsa_gfx11 : MIMG_nsa_gfx11 { let 
InOperandList = !con(AddrIns, - (ins SReg_256:$srsrc, DMask:$dmask, + (ins SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -498,7 +498,7 @@ class VIMAGE_NoSampler_gfx12 : VIMAGE_gfx12 { let InOperandList = !con(AddrIns, - (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim, + (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc$dmask$dim$cpol$r128$a16$tfe" @@ -510,8 +510,8 @@ class VSAMPLE_Sampler_gfx12 : VSAMPLE_gfx12 { let InOperandList = !con(AddrIns, - (ins SReg_256:$rsrc), - !if(BaseOpcode.Sampler, (ins SReg_128:$samp), (ins)), + (ins SReg_256_XNULL:$rsrc), + !if(BaseOpcode.Sampler, (ins SReg_128_XNULL:$samp), (ins)), (ins DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), @@ -527,8 +527,8 @@ class VSAMPLE_Sampler_nortn_gfx12 : VSAMPLE_gfx12 { let InOperandList = !con(AddrIns, - (ins SReg_256:$rsrc), - !if(BaseOpcode.Sampler, (ins SReg_128:$samp), (ins)), + (ins SReg_256_XNULL:$rsrc), + !if(BaseOpcode.Sampler, (ins SReg_128_XNULL:$samp), (ins)), (ins DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), @@ -679,7 +679,7 @@ class MIMG_Store_Helper : MIMG_gfx6789 { - let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, + let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -693,7 +693,7 @@ class MIMG_Store_Helper_gfx90a : MIMG_gfx90a { let InOperandList = !con((ins getLdStRegisterOperand.ret:$vdata, - addr_rc:$vaddr, SReg_256:$srsrc, + addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, 
LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -705,7 +705,7 @@ class MIMG_Store_gfx10 : MIMG_gfx10 { - let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -719,7 +719,7 @@ class MIMG_Store_nsa_gfx10 { let InOperandList = !con((ins DataRC:$vdata), AddrIns, - (ins SReg_256:$srsrc, DMask:$dmask, + (ins SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -731,7 +731,7 @@ class MIMG_Store_gfx11 : MIMG_gfx11 { - let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -745,7 +745,7 @@ class MIMG_Store_nsa_gfx11 { let InOperandList = !con((ins DataRC:$vdata), AddrIns, - (ins SReg_256:$srsrc, DMask:$dmask, + (ins SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -759,7 +759,7 @@ class VIMAGE_Store_gfx12 { let InOperandList = !con((ins DataRC:$vdata), AddrIns, - (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim, + (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc$dmask$dim$cpol$r128$a16$tfe" @@ -875,7 +875,7 @@ class MIMG_Atomic_gfx6789_base op, string asm, RegisterClass data_rc, : MIMG_gfx6789 { let Constraints = "$vdst = $vdata"; - let InOperandList = (ins data_rc:$vdata, 
addr_rc:$vaddr, SReg_256:$srsrc, + let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da); let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; @@ -887,7 +887,7 @@ class MIMG_Atomic_gfx90a_base op, string asm, RegisterClass data_rc, let Constraints = "$vdst = $vdata"; let InOperandList = (ins getLdStRegisterOperand.ret:$vdata, - addr_rc:$vaddr, SReg_256:$srsrc, + addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da); let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; @@ -921,7 +921,7 @@ class MIMG_Atomic_gfx10 { let Constraints = "$vdst = $vdata"; - let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; @@ -936,7 +936,7 @@ class MIMG_Atomic_nsa_gfx10 { let Constraints = "$vdst = $vdata"; - let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; @@ -964,7 +964,7 @@ class MIMG_Atomic_nsa_gfx11 : MIMG_gfx6789 { - let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, + let InOperandList = !con((ins src_rc:$vaddr, SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -1139,7 +1139,7 @@ class MIMG_Sampler_Helper : MIMG_gfx90a.ret:$vdata), dns> { - let 
InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, + let InOperandList = !con((ins src_rc:$vaddr, SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); @@ -1149,7 +1149,7 @@ class MIMG_Sampler_gfx90a { dag ret = !con(OpPrefix, - (ins SReg_256:$srsrc, SReg_128:$ssamp, + (ins SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe), !if(HasD16, (ins D16:$d16), (ins))); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 16a7a9cfbc49a..f3a962eea7539 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -809,6 +809,9 @@ def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, let BaseClassOrder = 32; } +def SGPR_NULL128 : SIReg<"null">; +def SGPR_NULL256 : SIReg<"null">; + let GeneratePressureSet = 0 in { def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add SReg_32, LDS_DIRECT_CLASS)> { @@ -885,6 +888,7 @@ multiclass SRegClass regTypes, SIRegisterTuples regList, SIRegisterTuples ttmpList = regList, + bit hasNull = 0, int copyCost = !sra(!add(numRegs, 1), 1)> { defvar hasTTMP = !ne(regList, ttmpList); defvar suffix = !cast(!mul(numRegs, 32)); @@ -901,7 +905,7 @@ multiclass SRegClass(sgprName)), !if(hasTTMP, @@ -910,15 +914,24 @@ multiclass SRegClass("SReg_" # suffix # "_XNULL"), !cast("SGPR_NULL" # suffix))> { + let isAllocatable = 0; + let BaseClassOrder = !mul(numRegs, 32); + } + } } } defm "" : SRegClass<3, Reg96Types.types, SGPR_96Regs, TTMP_96Regs>; -defm "" : SRegClass<4, Reg128Types.types, SGPR_128Regs, TTMP_128Regs>; +defm "" : SRegClass<4, Reg128Types.types, SGPR_128Regs, TTMP_128Regs, /*hasNull*/ true>; defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; 
defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; -defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], SGPR_256Regs, TTMP_256Regs>; +defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], SGPR_256Regs, TTMP_256Regs, /*hasNull*/ true>; defm "" : SRegClass<9, [v9i32, v9f32], SGPR_288Regs, TTMP_288Regs>; defm "" : SRegClass<10, [v10i32, v10f32], SGPR_320Regs, TTMP_320Regs>; defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>; diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 1aeb4e8b20e8f..60e4ce92ac25d 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -332,15 +332,15 @@ defm S_LOAD_I16 : SM_Pseudo_Loads ; defm S_LOAD_U16 : SM_Pseudo_Loads ; let is_buffer = 1 in { -defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads ; +defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads ; // FIXME: exec_lo/exec_hi appear to be allowed for SMRD loads on // SI/CI, bit disallowed for SMEM on VI. 
-defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads ; +defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads ; let SubtargetPredicate = HasScalarDwordx3Loads in - defm S_BUFFER_LOAD_DWORDX3 : SM_Pseudo_Loads ; -defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads ; -defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads ; -defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads ; + defm S_BUFFER_LOAD_DWORDX3 : SM_Pseudo_Loads ; +defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads ; +defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads ; +defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads ; defm S_BUFFER_LOAD_I8 : SM_Pseudo_Loads ; defm S_BUFFER_LOAD_U8 : SM_Pseudo_Loads ; defm S_BUFFER_LOAD_I16 : SM_Pseudo_Loads ; @@ -353,9 +353,9 @@ defm S_STORE_DWORDX2 : SM_Pseudo_Stores ; defm S_STORE_DWORDX4 : SM_Pseudo_Stores ; let is_buffer = 1 in { -defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores ; -defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores ; -defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores ; +defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores ; +defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores ; +defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores ; } } // End SubtargetPredicate = HasScalarStores @@ -401,33 +401,33 @@ defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores ; let SubtargetPredicate = HasScalarAtomics in { let is_buffer = 1 in { -defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_SUB : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_SMIN : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_UMIN : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_SMAX : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_UMAX : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_AND : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_OR : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_XOR : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_INC : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_DEC : SM_Pseudo_Atomics ; - -defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_CMPSWAP_X2 : 
SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_ADD_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_SUB_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_AND_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics ; -defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_SUB : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_SMIN : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_UMIN : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_SMAX : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_UMAX : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_AND : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_OR : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_XOR : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_INC : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_DEC : SM_Pseudo_Atomics ; + +defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_ADD_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_SUB_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_AND_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics ; +defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics ; } defm S_ATOMIC_SWAP : SM_Pseudo_Atomics ; diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll 
b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index cf9fdbdc34391..2ceaca3497ece 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %12 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %12 ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %12 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %10 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %10 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %10 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s index bd61ad3908d21..f6ea86ed7fe93 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s @@ -359,3 +359,130 @@ image_sample_c_d_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_ image_load v[0:1], v0, s[0:7] dmask:0x9 dim:1 D // NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid dim value + 
+// null is not allowed as SRSRC or SSAMP +image_atomic_add v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_and v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_cmpswap v[0:1], v[10:11], null dmask:0x3 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_dec v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_fcmpswap v[1:2], v[2:3], null dmask:0x3 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_fmax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_fmin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_inc v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_or v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_smax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_smin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_sub v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_swap v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_umax v1, v[10:11], null dmask:0x1 
dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_umin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_xor v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4 v[64:67], v32, null, s[4:11], dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4 v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_b v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_b v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_c v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_c v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4h v[64:67], v32, null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4h v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_l v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_l v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_o v[64:67], 
v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_o v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_load v[4:7], v0, null dmask:0xf dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_store v[0:3], v[254:255], null dmask:0xf dim:SQ_RSRC_IMG_2D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample v[5:6], v1, null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample v[5:6], v1, s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_b v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_b v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_c v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_c v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_d v[5:6], v[1:3], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_d v[5:6], v[1:3], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_l v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_l v[5:6], v[1:2], s[8:15], 
null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_o v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_o v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mtbuf_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_mtbuf_err.s new file mode 100644 index 0000000000000..5eb2e9c579a7d --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx10_asm_mtbuf_err.s @@ -0,0 +1,49 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOGFX10 --implicit-check-not=error: %s + +tbuffer_load_format_d16_x v3, v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_x v3, v0, null, s1 offen offset:4095 +// 
NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mubuf_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_mubuf_err.s new file mode 100644 index 0000000000000..bd7acfeb4b033 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx10_asm_mubuf_err.s @@ -0,0 +1,160 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOGFX10 --implicit-check-not=error: %s + +buffer_atomic_add v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_add_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_and v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_and_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cmpswap v[5:6], v0, null, s3 idxen 
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cmpswap_x2 v[5:8], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_dec v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_dec_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_inc v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_inc_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_or v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_or_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_smax v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_smax_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_smin v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_smin_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_sub v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_sub_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_swap v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_swap_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_umax v5, v0, null, s3 
idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_umax_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_umin v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_umin_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_xor v5, v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_xor_x2 v[5:6], v0, null, s3 idxen +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_d16_x v3, v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_dword v5, v0, null, s3 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_dwordx2 v[5:6], v0, null, s3 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_dwordx3 v[5:7], v0, null, s3 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_dwordx4 v[5:8], v0, null, s3 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_sbyte v5, v0, null, 
s3 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_sshort v5, v0, null, s3 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_ubyte v5, v0, null, s3 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_ushort v5, v0, null, s3 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_byte v1, v0, null, s4 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_dword v1, v0, null, s4 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_dwordx2 v[1:2], v0, null, s4 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_dwordx3 v[1:3], v0, null, s4 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_dwordx4 v[1:4], v0, null, s4 idxen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_d16_hi_x v1, v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_d16_x v1, v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_d16_xy v1, v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_d16_xyz v[1:2], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_d16_xyzw v[1:3], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_x v1, v0, null, s1 offen offset:4095 +// NOGFX10: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xy v[1:2], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xyz v[1:3], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xyzw v[1:4], v0, null, s1 offen offset:4095 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_smem.s b/llvm/test/MC/AMDGPU/gfx10_asm_smem.s index b582de83a2f29..683a0195037cf 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_smem.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_smem.s @@ -281,6 +281,22 @@ s_load_dwordx16 s[20:35], s[2:3], 0x1234 glc dlc s_load_dwordx16 s[20:35], s[2:3], s0 offset:0x12345 glc dlc // GFX10: encoding: [0x01,0x45,0x11,0xf4,0x45,0x23,0x01,0x00] +// null as dst +s_load_dword null, s[2:3], s0 +// GFX10: encoding: [0x41,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00] + +s_load_dwordx2 null, s[2:3], s0 +// GFX10: encoding: [0x41,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00] + +s_load_dwordx4 null, s[2:3], s0 +// GFX10: encoding: [0x41,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00] + +s_load_dwordx8 null, s[2:3], s0 +// GFX10: encoding: [0x41,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00] + +s_load_dwordx16 null, s[2:3], s0 +// GFX10: encoding: [0x41,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00] + s_buffer_load_dword s5, s[4:7], s0 // GFX10: encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s new file mode 100644 index 0000000000000..670e97325355b --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s @@ -0,0 +1,86 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOGFX10 --implicit-check-not=error: %s + +s_buffer_atomic_add s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+s_buffer_atomic_add_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_and s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_cmpswap s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_cmpswap_x2 s[4:7], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_dec s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_dec_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_inc s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_inc_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_or s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_or_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_smax s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_smax_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_smin s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_smin_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_sub s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_sub_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_swap s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid 
operand for instruction + +s_buffer_atomic_umax s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_umax_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_umin s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_atomic_umin_x2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dword s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx2 s[4:5], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx4 s[4:7], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx8 s[4:11], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx16 s[4:19], null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_store_dword s4, null, s101 +// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s index 9bf72a11e5eed..9c614453c1ebd 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s @@ -400,3 +400,120 @@ image_store_pck v1, v[2:3], s[12:19] dmask:0x1 unorm image_store_mip_pck v1, v[2:3], s[12:19] dmask:0x0 unorm // NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand +// null is not allowed as SRSRC or SSAMP +image_atomic_add v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_and v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+image_atomic_cmpswap v[0:1], v[10:11], null dmask:0x3 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_dec v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_inc v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_or v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_smax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_smin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_sub v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_swap v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_umax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_umin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_xor v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4 v[64:67], v32, null, s[4:11], dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4 v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_b v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// 
NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_b v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_c v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_c v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4h v[64:67], v32, null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4h v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_l v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_l v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_o v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_o v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_load v[4:7], v0, null dmask:0xf dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_store v[0:3], v[254:255], null dmask:0xf dim:SQ_RSRC_IMG_2D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample v[5:6], v1, null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample v[5:6], v1, s[8:15], null dmask:0x3 
dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_b v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_b v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_c v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_c v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_d v[5:6], v[1:3], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_d v[5:6], v[1:3], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_l v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_l v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_o v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_o v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_err.s new file mode 100644 index 0000000000000..3b69835c8eb51 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_err.s @@ -0,0 +1,49 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --check-prefixes=NOGFX11 --implicit-check-not=error: 
%s + +tbuffer_load_format_d16_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid 
operand for instruction + +tbuffer_store_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_err.s new file mode 100644 index 0000000000000..d3d74467d8099 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_err.s @@ -0,0 +1,229 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --check-prefixes=NOGFX11 --implicit-check-not=error: %s + +buffer_atomic_add_f32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_add_u32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_add_u64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_and_b32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_and_b64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cmpswap_b32 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cmpswap_b64 v[5:8], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cmpswap_f32 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_csub_u32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_dec_u32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_dec_u64 v[5:6], v0, null, 
s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_inc_u32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_inc_u64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_f32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_i32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_i64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_u32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_u64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_f32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_i32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_i64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_u32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_u64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_or_b32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_or_b64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_sub_u32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+buffer_atomic_sub_u64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_swap_b32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_swap_b64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_xor_b32 v5, v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_xor_b64 v[5:6], v0, null, s3 idxen +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_b128 v[5:8], v0, null, s3 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_b32 v5, v0, null, s3 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b64 v[1:2], v0, null, s4 idxen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b96 v[1:3], v0, null, s4 idxen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_b16 v5, v0, null, s3 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_format_xy v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_format_xyz v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_format_xyzw v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_hi_b16 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid 
operand for instruction + +buffer_load_d16_hi_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_hi_i8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_hi_u8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_i8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_u8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_i16 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_i8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_lds_b32 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_lds_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_lds_i16 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_lds_i8 v3, v0, null, s1 offen offset:4095 
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_lds_u16 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_lds_u8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_u16 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_u8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b16 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b32 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b64 v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b96 v[3:5], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_format_xy v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_format_xyz v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_format_xyzw v[3:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_hi_b16 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+buffer_store_d16_hi_b8 v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_hi_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_x v1, v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xy v[1:2], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xyz v[1:3], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xyzw v[1:4], v0, null, s1 offen offset:4095 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_smem.s b/llvm/test/MC/AMDGPU/gfx11_asm_smem.s index 1d6b947609075..e071c67f85891 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_smem.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_smem.s @@ -239,6 +239,22 @@ s_load_b512 s[20:35], s[2:3], s0 glc dlc s_load_b512 s[20:35], s[2:3], 0x1234 glc dlc // GFX11: encoding: [0x01,0x65,0x10,0xf4,0x34,0x12,0x00,0xf8] +// null as dst +s_load_b32 null, s[2:3], s0 +// GFX11: encoding: [0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00] + +s_load_b64 null, s[2:3], s0 +// GFX11: encoding: [0x01,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00] + +s_load_b128 null, s[2:3], s0 +// GFX11: encoding: [0x01,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00] + +s_load_b256 null, s[2:3], s0 +// GFX11: encoding: [0x01,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00] + +s_load_b512 null, s[2:3], s0 +// GFX11: encoding: [0x01,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00] + s_buffer_load_b32 s5, s[4:7], s0 // GFX11: encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s new file mode 100644 index 0000000000000..da195b4a41182 --- /dev/null +++ 
b/llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s @@ -0,0 +1,31 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --check-prefixes=NOGFX11 --implicit-check-not=error: %s + +s_buffer_load_b32 s4, null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_b64 s4, null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_b128 s4, null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_b256 s4, null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_b512 s4, null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dword s4, null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx2 s[4:5], null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx4 s[4:7], null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx8 s[4:11], null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx16 s[4:19], null, s101 +// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s index a0d11c985c6b7..0f2cfc39e2ec8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s @@ -255,3 +255,122 @@ image_store_pck v5, v1, s[8:15] dmask:0x1 th:TH_STORE_NT image_store_mip_pck v5, [v0, v1], s[8:15] dmask:0x1 th:TH_STORE_NT // NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand + +// null is not allowed as SRSRC or SSAMP +image_atomic_add v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+image_atomic_and v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_cmpswap v[0:1], v[10:11], null dmask:0x3 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_dec v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_inc v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_or v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_smax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_smin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_sub v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_swap v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_umax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_umin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_atomic_xor v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4 v[64:67], v32, null, s[4:11], dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4 v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_b v[64:67], [v32, v33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_b v[64:67], [v32, v33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_c v[64:67], [v32, v33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_c v[64:67], [v32, v33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4h v[64:67], v32, null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4h v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_l v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_l v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_o v[64:67], [v32, v33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_gather4_o v[64:67], [v32, v33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_load v[4:7], v0, null dmask:0xf dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_store v[0:3], v[254:255], null dmask:0xf dim:SQ_RSRC_IMG_2D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample v[5:6], v1, null, s[12:15] 
dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample v[5:6], v1, s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_b v[5:6], [v1, v2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_b v[5:6], [v1, v2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_c v[5:6], [v1, v2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_c v[5:6], [v1, v2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_d v[5:6], [v1, v2, v3], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_d v[5:6], [v1, v2, v3], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_l v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_l v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_o v[5:6], [v1, v2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +image_sample_o v[5:6], [v1, v2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_smem.s b/llvm/test/MC/AMDGPU/gfx12_asm_smem.s index 668f767661f68..2ef027459fa6a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_smem.s 
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_smem.s @@ -541,6 +541,25 @@ s_load_b512 s[20:35], s[2:3], m0 s_load_b512 s[20:35], s[2:3], 0x0 // GFX12: s_load_b512 s[20:35], s[2:3], 0x0 ; encoding: [0x01,0x85,0x00,0xf4,0x00,0x00,0x00,0xf8] +// null as dst +s_load_b32 null, s[2:3], s0 offset:0x0 +// GFX12: encoding: [0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00] + +s_load_b64 null, s[2:3], s0 offset:0x0 +// GFX12: encoding: [0x01,0x3f,0x00,0xf4,0x00,0x00,0x00,0x00] + +s_load_b96 null, s[2:3], s0 offset:0x0 +// GFX12: encoding: [0x01,0xbf,0x00,0xf4,0x00,0x00,0x00,0x00] + +s_load_b128 null, s[2:3], s0 offset:0x0 +// GFX12: encoding: [0x01,0x5f,0x00,0xf4,0x00,0x00,0x00,0x00] + +s_load_b256 null, s[2:3], s0 offset:0x0 +// GFX12: encoding: [0x01,0x7f,0x00,0xf4,0x00,0x00,0x00,0x00] + +s_load_b512 null, s[2:3], s0 offset:0x0 +// GFX12: encoding: [0x01,0x9f,0x00,0xf4,0x00,0x00,0x00,0x00] + s_buffer_load_b32 s5, s[4:7], s0 // GFX12: s_buffer_load_b32 s5, s[4:7], s0 offset:0x0 ; encoding: [0x42,0x01,0x02,0xf4,0x00,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s new file mode 100644 index 0000000000000..0f62c8b939991 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s @@ -0,0 +1,31 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefixes=NOGFX12 --implicit-check-not=error: %s + +s_buffer_load_b32 s4, null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_b64 s4, null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_b128 s4, null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_b256 s4, null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_b512 s4, null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dword s4, null, s101 +// NOGFX12: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx2 s[4:5], null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx4 s[4:7], null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx8 s[4:11], null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +s_buffer_load_dwordx16 s[4:19], null, s101 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_err.s new file mode 100644 index 0000000000000..040119ce892e6 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_err.s @@ -0,0 +1,49 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefixes=NOGFX12 --implicit-check-not=error: %s + +tbuffer_load_format_d16_x v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_load_format_xyzw v[3:6], v0, null, s1 offen 
offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_x v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +tbuffer_store_format_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_err.s new file mode 100644 index 0000000000000..2c9ce7a7efe21 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_err.s @@ -0,0 +1,220 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefixes=NOGFX12 --implicit-check-not=error: %s + +buffer_atomic_add_f32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_add_u32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_add_u64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for 
instruction + +buffer_atomic_and_b32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_and_b64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cmpswap_b32 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cmpswap_b64 v[5:8], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cond_sub_u32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_dec_u32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_dec_u64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_inc_u32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_inc_u64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_i32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_i64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_num_f32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_u32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_max_u64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_i32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_i64 v[5:6], v0, null, s3 idxen +// NOGFX12: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_u32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_u64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_min_num_f32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_or_b32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_or_b64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_pk_add_bf16 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_pk_add_f16 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_sub_clamp_u32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_sub_u32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_sub_u64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_swap_b32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_swap_b64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_xor_b32 v5, v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_xor_b64 v[5:6], v0, null, s3 idxen +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_b128 v[5:8], v0, null, s3 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+buffer_load_b32 v5, v0, null, s3 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b64 v[1:2], v0, null, s4 idxen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b96 v[1:3], v0, null, s4 idxen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_b16 v5, v0, null, s3 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_format_xy v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_format_xyz v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_format_xyzw v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_hi_b16 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_hi_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_hi_i8 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_hi_u8 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_i8 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_d16_u8 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_x v3, v0, null, s1 offen offset:4095 +// 
NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_i16 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_i8 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_u16 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_load_u8 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b128 v[3:6], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b16 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b32 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b64 v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b8 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_b96 v[3:5], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_format_xy v3, 
v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_format_xyz v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_format_xyzw v[3:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_hi_b16 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_hi_b8 v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_d16_hi_format_x v3, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_x v1, v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xy v[1:2], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xyz v[1:3], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_store_format_xyzw v[1:4], v0, null, s1 offen offset:4095 +// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt index 890a64b22f399..95301677272c2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt @@ -1261,3 +1261,18 @@ # GFX10: s_store_dwordx4 s[96:99], s[4:5], s0 ; encoding: [0x02,0x18,0x48,0xf4,0x00,0x00,0x00,0x00] 0x02,0x18,0x48,0xf4,0x00,0x00,0x00,0x00 + +# GFX10: s_load_dword null, s[2:3], s0 ; encoding: [0x41,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00] +0x41,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00 + +# GFX10: s_load_dwordx2 null, s[2:3], s0 ; 
encoding: [0x41,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00] +0x41,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00 + +# GFX10: s_load_dwordx4 null, s[2:3], s0 ; encoding: [0x41,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00] +0x41,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00 + +# GFX10: s_load_dwordx8 null, s[2:3], s0 ; encoding: [0x41,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00] +0x41,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00 + +# GFX10: s_load_dwordx16 null, s[2:3], s0 ; encoding: [0x41,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00] +0x41,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt index 8b49de5d89909..8396132a5b29c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt @@ -471,3 +471,18 @@ # GFX11: s_gl1_inv ; encoding: [0x00,0x00,0x80,0xf4,0x00,0x00,0x00,0x00] 0x00,0x00,0x80,0xf4,0x00,0x00,0x00,0x00 + +# GFX11: s_load_b32 null, s[2:3], s0 ; encoding: [0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00] +0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00 + +# GFX11: s_load_b64 null, s[2:3], s0 ; encoding: [0x01,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00] +0x01,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00 + +# GFX11: s_load_b128 null, s[2:3], s0 ; encoding: [0x01,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00] +0x01,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00 + +# GFX11: s_load_b256 null, s[2:3], s0 ; encoding: [0x01,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00] +0x01,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00 + +# GFX11: s_load_b512 null, s[2:3], s0 ; encoding: [0x01,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00] +0x01,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt index 28decdd4c5b1e..02641e6eb97f0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt @@ -1277,3 +1277,21 @@ # GFX12: s_buffer_load_u16 s5, s[96:99], s0 offset:0x0 th:TH_LOAD_HT scope:SCOPE_SYS ; 
encoding: [0x70,0x61,0x63,0xf5,0x00,0x00,0x00,0x00] 0x70,0x61,0x63,0xf5,0x00,0x00,0x00,0x00 + +# GFX12: s_load_b32 null, s[2:3], s0 offset:0x0 ; encoding: [0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00] +0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00 + +# GFX12: s_load_b64 null, s[2:3], s0 offset:0x0 ; encoding: [0x01,0x3f,0x00,0xf4,0x00,0x00,0x00,0x00] +0x01,0x3f,0x00,0xf4,0x00,0x00,0x00,0x00 + +# GFX12: s_load_b96 null, s[2:3], s0 offset:0x0 ; encoding: [0x01,0xbf,0x00,0xf4,0x00,0x00,0x00,0x00] +0x01,0xbf,0x00,0xf4,0x00,0x00,0x00,0x00 + +# GFX12: s_load_b128 null, s[2:3], s0 offset:0x0 ; encoding: [0x01,0x5f,0x00,0xf4,0x00,0x00,0x00,0x00] +0x01,0x5f,0x00,0xf4,0x00,0x00,0x00,0x00 + +# GFX12: s_load_b256 null, s[2:3], s0 offset:0x0 ; encoding: [0x01,0x7f,0x00,0xf4,0x00,0x00,0x00,0x00] +0x01,0x7f,0x00,0xf4,0x00,0x00,0x00,0x00 + +# GFX12: s_load_b512 null, s[2:3], s0 offset:0x0 ; encoding: [0x01,0x9f,0x00,0xf4,0x00,0x00,0x00,0x00] +0x01,0x9f,0x00,0xf4,0x00,0x00,0x00,0x00 From 20d491bb993218eae6a13e4901da943ebd804f7a Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 3 Jan 2025 19:56:44 +0000 Subject: [PATCH 121/480] [VPlan] Remove re-using vector PH in VPBasicBlock::execute (NFC). Remove logic to re-use the previous basic block for the vector pre header from VPBasicBlock::execute. The preheader is now modeled as VPIRBasicBlock, so the code is no longer needed. Split off from https://github.com/llvm/llvm-project/pull/108378. --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 6d02efc05614a..06c36396a17f3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -487,11 +487,9 @@ void VPBasicBlock::execute(VPTransformState *State) { }; // 1. Create an IR basic block. 
- if (this == getPlan()->getVectorPreheader() || - (Replica && this == getParent()->getEntry()) || + if ((Replica && this == getParent()->getEntry()) || IsReplicateRegion(getSingleHierarchicalPredecessor())) { // Reuse the previous basic block if the current VPBB is either - // * the vector preheader, // * the entry to a replicate region, or // * the exit of a replicate region. State->CFG.VPBB2IRBB[this] = NewBB; From dc307be1b573c1bd6c2f8a3af9edd3455508dc7c Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 15:45:33 -0500 Subject: [PATCH 122/480] [AMDGPU][True16][MC] true16 for v_fract_f16 (#120647) Support true16 format for v_fract_f16 in MC --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/CodeGen/AMDGPU/fract-match.ll | 428 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 75 +-- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 65 +-- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 21 +- llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s | 42 ++ .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s | 154 +++++-- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 65 +-- .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 25 +- .../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s | 69 +-- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 72 +-- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 62 +-- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 18 +- llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s | 42 ++ .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s | 154 +++++-- .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 69 +-- .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 65 +-- .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 25 +- .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 63 ++- .../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 54 ++- .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 17 +- .../gfx11_dasm_vop3_dpp16_from_vop1.txt | 54 ++- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 24 +- .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt | 57 ++- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 50 +- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt 
| 13 +- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 58 ++- .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 55 ++- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 25 +- 29 files changed, 1461 insertions(+), 462 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 79f0caec418ba..b58b7a5fcdcd0 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1044,7 +1044,7 @@ defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16 defm V_CEIL_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">; defm V_TRUNC_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05d, "v_trunc_f16">; defm V_RNDNE_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">; -defm V_FRACT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05f, "v_fract_f16">; +defm V_FRACT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05f, "v_fract_f16">; defm V_SIN_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x060, "v_sin_f16">; defm V_COS_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">; defm V_SAT_PK_U8_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">; diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index f6ee007facd7f..80b4d64b1236f 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -14,6 +14,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s ; Test patterns to match v_fract_* instructions. 
@@ -103,6 +104,21 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) nocapture writeonly ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX11-NEXT: global_store_b32 v[1:2], v4, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v3, v0 +; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX12-NEXT: v_floor_f32_e32 v4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo +; GFX12-NEXT: global_store_b32 v[1:2], v4, off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -181,6 +197,18 @@ define float @safe_math_fract_f32_noinf_check(float %x, ptr addrspace(1) nocaptu ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v[1:2], v3, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f32_noinf_check: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v3, v0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: global_store_b32 v[1:2], v3, off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -263,6 +291,22 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) nocapture w ; GFX11-NEXT: v_min_f32_e32 v4, 0x3f7fffff, v4 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: no_nan_check_math_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v3, v0 +; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v4, v0, v3 +; GFX12-NEXT: global_store_b32 v[1:2], v3, off +; GFX12-NEXT: v_min_num_f32_e32 v4, 0x3f7fffff, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -314,6 +358,16 @@ define float @basic_fract_f32_nonans(float nofpclass(nan) %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f32_nonans: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -362,6 +416,19 @@ define float @basic_fract_f32_flags_minnum(float %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f32_flags_minnum: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -409,6 
+476,16 @@ define float @basic_fract_f32_flags_fsub(float nofpclass(nan) %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f32_flags_fsub: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub nsz float %x, %floor @@ -467,6 +544,17 @@ define <2 x float> @basic_fract_v2f32_nonans(<2 x float> nofpclass(nan) %x) { ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: v_fract_f32_e32 v1, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_v2f32_nonans: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: v_fract_f32_e32 v1, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x) %sub = fsub <2 x float> %x, %floor @@ -540,6 +628,20 @@ define float @basic_fract_f32_multi_use_fsub_nonans(float nofpclass(nan) %x, ptr ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v[1:2], v3, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f32_multi_use_fsub_nonans: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v3, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v3, v0, v3 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: global_store_b32 v[1:2], v3, off +; GFX12-NEXT: 
s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -588,6 +690,16 @@ define float @nnan_minnum_fract_f32(float %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: nnan_minnum_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -638,6 +750,19 @@ define float @nnan_fsub_fract_f32(float %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: nnan_fsub_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub nnan float %x, %floor @@ -686,6 +811,19 @@ define float @nnan_floor_fract_f32(float %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: nnan_floor_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call nnan float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -733,6 +871,16 @@ define float @nnan_src_fract_f32(float nofpclass(nan) %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: nnan_src_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -782,6 +930,19 @@ define float @not_fract_f32_wrong_const(float nofpclass(nan) %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7ffffe, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: not_fract_f32_wrong_const: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7ffffe, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -831,6 +992,19 @@ define float @not_fract_f32_swapped_fsub(float nofpclass(nan) %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v1, v0 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: not_fract_f32_swapped_fsub: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %floor, %x @@ -880,6 +1054,19 @@ define float @not_fract_f32_not_floor(float nofpclass(nan) %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: not_fract_f32_not_floor: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_trunc_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.trunc.f32(float %x) %sub = fsub float %x, %floor @@ -929,6 +1116,19 @@ define float @not_fract_f32_different_floor(float %x, float %y) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: not_fract_f32_different_floor: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail 
call float @llvm.floor.f32(float %y) %sub = fsub float %x, %floor @@ -978,6 +1178,19 @@ define float @not_fract_f32_maxnum(float nofpclass(nan) %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_max_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: not_fract_f32_maxnum: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_max_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1000,6 +1213,15 @@ define float @fcmp_uno_check_is_nan_f32(float %x) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: fcmp_uno_check_is_nan_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1054,6 +1276,16 @@ define float @select_nan_fract_f32(float %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: select_nan_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float 
%x, %floor @@ -1107,6 +1339,16 @@ define float @commuted_select_nan_fract_f32(float %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: commuted_select_nan_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1168,6 +1410,22 @@ define float @wrong_commuted_nan_select_f32(float %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: wrong_commuted_nan_select_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v1, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v1, 0x3f7fffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1231,6 +1489,16 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f16_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f16_nonan: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 
+; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f16_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call half @llvm.floor.f16(half %x) %sub = fsub half %x, %floor @@ -1313,6 +1581,20 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) { ; GFX11-NEXT: v_fract_f16_e32 v1, v1 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_v2f16_nonan: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-NEXT: v_fract_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_fract_f16_e32 v1, v1 +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x) %sub = fsub <2 x half> %x, %floor @@ -1369,6 +1651,16 @@ define double @basic_fract_f64_nanans(double nofpclass(nan) %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f64_e32 v[0:1], v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f64_nanans: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f64_e32 v[0:1], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call double @llvm.floor.f64(double %x) %sub = fsub double %x, %floor @@ -1461,6 +1753,18 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) nocapture ; GFX11-NEXT: v_fract_f16_e32 v0, v0 ; GFX11-NEXT: global_store_b16 v[1:2], v3, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f16_noinf_check: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f16_e32 v3, v0 +; GFX12-NEXT: v_fract_f16_e32 v0, v0 +; GFX12-NEXT: global_store_b16 v[1:2], v3, off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call half @llvm.floor.f16(half %x) %sub = fsub half %x, %floor @@ -1546,6 +1850,18 @@ define double @safe_math_fract_f64_noinf_check(double %x, ptr addrspace(1) nocap ; GFX11-NEXT: v_fract_f64_e32 v[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v[2:3], v[4:5], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f64_noinf_check: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f64_e32 v[4:5], v[0:1] +; GFX12-NEXT: v_fract_f64_e32 v[0:1], v[0:1] +; GFX12-NEXT: global_store_b64 v[2:3], v[4:5], off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call double @llvm.floor.f64(double %x) %sub = fsub double %x, %floor @@ -1600,6 +1916,16 @@ define float @select_nan_fract_f32_flags_select(float %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: select_nan_fract_f32_flags_select: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1653,6 +1979,16 @@ define float @select_nan_fract_f32_flags_minnum(float %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX12-LABEL: select_nan_fract_f32_flags_minnum: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1769,6 +2105,25 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap ; GFX11-NEXT: global_store_b64 v[2:3], v[4:5], off ; GFX11-NEXT: v_cndmask_b32_e64 v1, v7, 0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_v2f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v6, v0 +; GFX12-NEXT: v_cmp_class_f32_e64 s0, v0, 0x204 +; GFX12-NEXT: v_fract_f32_e32 v7, v1 +; GFX12-NEXT: v_floor_f32_e32 v4, v0 +; GFX12-NEXT: v_floor_f32_e32 v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 +; GFX12-NEXT: v_cmp_class_f32_e64 s0, v1, 0x204 +; GFX12-NEXT: global_store_b64 v[2:3], v[4:5], off +; GFX12-NEXT: v_cndmask_b32_e64 v1, v7, 0, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x) %sub = fsub <2 x float> %x, %floor @@ -1881,6 +2236,21 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon ; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 ; GFX11-NEXT: global_store_b64 v[2:3], v[6:7], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-NEXT: v_fract_f64_e32 v[4:5], v[0:1] +; GFX12-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| +; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 +; GFX12-NEXT: global_store_b64 v[2:3], v[6:7], off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call double @llvm.floor.f64(double %x) %sub = fsub double %x, %floor @@ -2002,6 +2372,21 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly % ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX11-NEXT: global_store_b16 v[1:2], v4, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f16_e32 v3, v0 +; GFX12-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0| +; GFX12-NEXT: v_floor_f16_e32 v4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo +; GFX12-NEXT: global_store_b16 v[1:2], v4, off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call half @llvm.floor.f16(half %x) %sub = fsub half %x, %floor @@ -2168,6 +2553,29 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX11-NEXT: global_store_b32 v[1:2], v4, off ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_v2f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: v_fract_f16_e32 v6, v0 +; GFX12-NEXT: v_floor_f16_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | 
instid1(VALU_DEP_2) +; GFX12-NEXT: v_fract_f16_e32 v4, v3 +; GFX12-NEXT: v_cmp_class_f16_e64 s0, v3, 0x204 +; GFX12-NEXT: v_floor_f16_e32 v7, v3 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 +; GFX12-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pack_b32_f16 v4, v5, v7 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 +; GFX12-NEXT: global_store_b32 v[1:2], v4, off +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x) %sub = fsub <2 x half> %x, %floor @@ -2311,6 +2719,26 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc ; GFX11-NEXT: v_cndmask_b32_e64 v3, v13, 0, s1 ; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_v2f64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f64_e32 v[10:11], v[0:1] +; GFX12-NEXT: v_cmp_class_f64_e64 s0, v[0:1], 0x204 +; GFX12-NEXT: v_fract_f64_e32 v[12:13], v[2:3] +; GFX12-NEXT: v_cmp_class_f64_e64 s1, v[2:3], 0x204 +; GFX12-NEXT: v_floor_f64_e32 v[8:9], v[2:3] +; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v0, v10, 0, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v1, v11, 0, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v2, v12, 0, s1 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v13, 0, s1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x double> @llvm.floor.v2f64(<2 x double> %x) %sub = fsub <2 x double> %x, %floor diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index b98955d268a72..0d29ed985269a 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ 
b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -2093,50 +2093,65 @@ v_floor_f64 v[5:6], src_scc v_floor_f64 v[254:255], 0xaf123456 // GFX11: v_floor_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf] -v_fract_f16 v5, v1 -// GFX11: v_fract_f16_e32 v5, v1 ; encoding: [0x01,0xbf,0x0a,0x7e] +v_fract_f16 v5.l, v1.l +// GFX11: v_fract_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbf,0x0a,0x7e] -v_fract_f16 v5, v127 -// GFX11: v_fract_f16_e32 v5, v127 ; encoding: [0x7f,0xbf,0x0a,0x7e] +v_fract_f16 v5.l, v127.l +// GFX11: v_fract_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbf,0x0a,0x7e] -v_fract_f16 v5, s1 -// GFX11: v_fract_f16_e32 v5, s1 ; encoding: [0x01,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, s1 +// GFX11: v_fract_f16_e32 v5.l, s1 ; encoding: [0x01,0xbe,0x0a,0x7e] -v_fract_f16 v5, s105 -// GFX11: v_fract_f16_e32 v5, s105 ; encoding: [0x69,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, s105 +// GFX11: v_fract_f16_e32 v5.l, s105 ; encoding: [0x69,0xbe,0x0a,0x7e] -v_fract_f16 v5, vcc_lo -// GFX11: v_fract_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, vcc_lo +// GFX11: v_fract_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e] -v_fract_f16 v5, vcc_hi -// GFX11: v_fract_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, vcc_hi +// GFX11: v_fract_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e] -v_fract_f16 v5, ttmp15 -// GFX11: v_fract_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, ttmp15 +// GFX11: v_fract_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e] -v_fract_f16 v5, m0 -// GFX11: v_fract_f16_e32 v5, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, m0 +// GFX11: v_fract_f16_e32 v5.l, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e] -v_fract_f16 v5, exec_lo -// GFX11: v_fract_f16_e32 v5, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, exec_lo +// GFX11: v_fract_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e] -v_fract_f16 v5, exec_hi -// GFX11: v_fract_f16_e32 v5, 
exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, exec_hi +// GFX11: v_fract_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e] -v_fract_f16 v5, null -// GFX11: v_fract_f16_e32 v5, null ; encoding: [0x7c,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, null +// GFX11: v_fract_f16_e32 v5.l, null ; encoding: [0x7c,0xbe,0x0a,0x7e] -v_fract_f16 v5, -1 -// GFX11: v_fract_f16_e32 v5, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, -1 +// GFX11: v_fract_f16_e32 v5.l, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e] -v_fract_f16 v5, 0.5 -// GFX11: v_fract_f16_e32 v5, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, 0.5 +// GFX11: v_fract_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e] -v_fract_f16 v5, src_scc -// GFX11: v_fract_f16_e32 v5, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, src_scc +// GFX11: v_fract_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e] -v_fract_f16 v127, 0xfe0b -// GFX11: v_fract_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_fract_f16 v127.l, 0xfe0b +// GFX11: v_fract_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_fract_f16 v5.l, v1.h +// GFX11: v_fract_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbf,0x0a,0x7e] + +v_fract_f16 v5.l, v127.h +// GFX11: v_fract_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbf,0x0a,0x7e] + +v_fract_f16 v127.l, 0.5 +// GFX11: v_fract_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xbe,0xfe,0x7e] + +v_fract_f16 v5.h, src_scc +// GFX11: v_fract_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7f] + +v_fract_f16 v127.h, 0xfe0b +// GFX11: v_fract_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_fract_f32 v5, v1 // GFX11: v_fract_f32_e32 v5, v1 ; encoding: [0x01,0x41,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index f46abd344d607..d4fb880f25b55 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ 
-1598,47 +1598,56 @@ v_floor_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_floor_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_floor_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_fract_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_fract_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_fract_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_fract_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_fract_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_fract_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_fract_f16 v5, v1 row_mirror -// GFX11: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_fract_f16 v5.l, v1.l row_mirror +// GFX11: v_fract_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_fract_f16 v5, v1 row_half_mirror -// GFX11: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_fract_f16 v5.l, v1.l row_half_mirror +// GFX11: v_fract_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_fract_f16 v5, v1 row_shl:1 -// GFX11: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_fract_f16 v5.l, v1.l row_shl:1 +// GFX11: v_fract_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_fract_f16 v5, v1 row_shl:15 -// GFX11: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_fract_f16 v5.l, v1.l row_shl:15 +// GFX11: v_fract_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_fract_f16 v5, v1 row_shr:1 -// GFX11: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_fract_f16 v5.l, v1.l row_shr:1 +// GFX11: v_fract_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_fract_f16 v5, v1 row_shr:15 -// GFX11: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_fract_f16 v5.l, v1.l row_shr:15 +// GFX11: v_fract_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_fract_f16 v5, v1 row_ror:1 -// GFX11: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_fract_f16 v5.l, v1.l row_ror:1 +// GFX11: v_fract_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_fract_f16 v5, v1 row_ror:15 -// GFX11: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_fract_f16 v5.l, v1.l row_ror:15 +// GFX11: v_fract_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_fract_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_fract_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_fract_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_fract_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_fract_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_fract_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_fract_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_fract_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_fract_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_fract_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_fract_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_fract_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_fract_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_fract_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +v_fract_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_fract_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_fract_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_fract_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_fract_f32 v5, v1 
quad_perm:[3,2,1,0] // GFX11: v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index c5df74758d71e..b6094c5ea3bd6 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -377,14 +377,23 @@ v_floor_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_floor_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_floor_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_fract_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_fract_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_fract_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_fract_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_fract_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_fract_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +v_fract_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_fract_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_fract_f16 v127.h, v127.h 
dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_fract_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00] v_fract_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_fract_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index ca181f1e59db5..98db7cc8bbc40 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -458,6 +458,12 @@ v_floor_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] v_fract_f16_e32 v128, 0xfe0b // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_fract_f16_e32 v128.h, 0xfe0b +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v128.l, 0xfe0b +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + v_fract_f16_e32 v255, v1 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -467,6 +473,24 @@ v_fract_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_fract_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction +v_fract_f16_e32 v255.h, v1.h +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.l, v1.l +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction + v_fract_f16_e32 v5, v199 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -476,6 
+500,24 @@ v_fract_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_fract_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction +v_fract_f16_e32 v5.h, v199.h +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.l, v199.l +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction + v_frexp_exp_i16_f16_e32 v128.h, 0xfe0b // GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index a0a07a03e14c3..9de05d4a82465 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -1208,71 +1208,137 @@ v_floor_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_floor_f16 v5, v199 quad_perm:[3,2,1,0] // GFX11: v_floor_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_fract_f16 v128, 0xfe0b -// GFX11: v_fract_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_fract_f16 v128.h, 0xfe0b +// GFX11: v_fract_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_fract_f16 v255, -1 -// GFX11: v_fract_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +v_fract_f16 v128.l, 0xfe0b +// GFX11: v_fract_f16_e64 v128.l, 0xfe0b ; encoding: 
[0x80,0x00,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_fract_f16 v255, 0.5 -// GFX11: v_fract_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x00] +v_fract_f16 v255.h, -1 +// GFX11: v_fract_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xc1,0x00,0x00,0x00] -v_fract_f16 v255, exec_hi -// GFX11: v_fract_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +v_fract_f16 v255.h, 0.5 +// GFX11: v_fract_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xf0,0x00,0x00,0x00] -v_fract_f16 v255, exec_lo -// GFX11: v_fract_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +v_fract_f16 v255.h, exec_hi +// GFX11: v_fract_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7f,0x00,0x00,0x00] -v_fract_f16 v255, m0 -// GFX11: v_fract_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +v_fract_f16 v255.h, exec_lo +// GFX11: v_fract_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7e,0x00,0x00,0x00] -v_fract_f16 v255, null -// GFX11: v_fract_f16_e64 v255, null ; encoding: [0xff,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +v_fract_f16 v255.h, m0 +// GFX11: v_fract_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7d,0x00,0x00,0x00] -v_fract_f16 v255, s1 -// GFX11: v_fract_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +v_fract_f16 v255.h, null +// GFX11: v_fract_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7c,0x00,0x00,0x00] -v_fract_f16 v255, s105 -// GFX11: v_fract_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +v_fract_f16 v255.h, s1 +// GFX11: v_fract_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x01,0x00,0x00,0x00] -v_fract_f16 v255, src_scc -// GFX11: v_fract_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x00] +v_fract_f16 v255.h, s105 +// GFX11: v_fract_f16_e64 v255.h, s105 
op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x69,0x00,0x00,0x00] -v_fract_f16 v255, ttmp15 -// GFX11: v_fract_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +v_fract_f16 v255.h, src_scc +// GFX11: v_fract_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xfd,0x00,0x00,0x00] -v_fract_f16 v255, v1 -// GFX11: v_fract_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +v_fract_f16 v255.h, ttmp15 +// GFX11: v_fract_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7b,0x00,0x00,0x00] -v_fract_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_fract_f16 v255.h, v1.h +// GFX11: v_fract_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00] -v_fract_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_fract_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_fract_f16 v255, v127 -// GFX11: v_fract_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x01,0x00,0x00] +v_fract_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_fract_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_fract_f16 v255.h, v127.h +// GFX11: v_fract_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdf,0xd5,0x7f,0x01,0x00,0x00] -v_fract_f16 v255, v127 
quad_perm:[3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_fract_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_fract_f16 v255, vcc_hi -// GFX11: v_fract_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +v_fract_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_fract_f16 v255, vcc_lo -// GFX11: v_fract_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +v_fract_f16 v255.h, vcc_hi +// GFX11: v_fract_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x6b,0x00,0x00,0x00] -v_fract_f16 v5, v199 -// GFX11: v_fract_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xdf,0xd5,0xc7,0x01,0x00,0x00] +v_fract_f16 v255.h, vcc_lo +// GFX11: v_fract_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x6a,0x00,0x00,0x00] -v_fract_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_fract_f16 v255.l, -1 +// GFX11: v_fract_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] -v_fract_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_fract_f16 v255.l, 0.5 +// GFX11: v_fract_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x00] + +v_fract_f16 v255.l, exec_hi +// GFX11: v_fract_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] + 
+v_fract_f16 v255.l, exec_lo +// GFX11: v_fract_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] + +v_fract_f16 v255.l, m0 +// GFX11: v_fract_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] + +v_fract_f16 v255.l, null +// GFX11: v_fract_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] + +v_fract_f16 v255.l, s1 +// GFX11: v_fract_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] + +v_fract_f16 v255.l, s105 +// GFX11: v_fract_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] + +v_fract_f16 v255.l, src_scc +// GFX11: v_fract_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x00] + +v_fract_f16 v255.l, ttmp15 +// GFX11: v_fract_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] + +v_fract_f16 v255.l, v1.l +// GFX11: v_fract_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] + +v_fract_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_fract_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_fract_f16 v255.l, v127.l +// GFX11: v_fract_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x01,0x00,0x00] + +v_fract_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_fract_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_fract_f16 v255.l, vcc_hi +// GFX11: v_fract_f16_e64 v255.l, vcc_hi ; 
encoding: [0xff,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] + +v_fract_f16 v255.l, vcc_lo +// GFX11: v_fract_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] + +v_fract_f16 v5.h, v199.h +// GFX11: v_fract_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0xc7,0x01,0x00,0x00] + +v_fract_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_fract_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_fract_f16 v5.l, v199.l +// GFX11: v_fract_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xdf,0xd5,0xc7,0x01,0x00,0x00] + +v_fract_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_fract_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_frexp_exp_i16_f16 v128.h, 0xfe0b // GFX11: v_frexp_exp_i16_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xda,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index 1a7eb2c23a7d2..b674395fddf63 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -1684,47 +1684,56 @@ v_floor_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 
row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_fract_f16_e64_dpp v5, v1 row_mirror -// GFX11: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_mirror +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_half_mirror -// GFX11: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_shl:1 -// GFX11: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX11: 
v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_shl:15 -// GFX11: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_shr:1 -// GFX11: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_shr:15 -// GFX11: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_ror:1 -// GFX11: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_ror:15 -// GFX11: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 
bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_fract_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_fract_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_fract_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index 73c21ce24d15c..a07db726574e5 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -487,17 +487,26 @@ v_floor_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xa4,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_fract_f16_e64_dpp v5.l, 
v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_fract_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_fract_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x08,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_fract_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0xc1,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 860c0f4eca7b3..964a19205df5c 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -2086,50 +2086,59 @@ v_floor_f64_e64 v[5:6], -|src_scc| 
mul:4 v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 // GFX11: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_fract_f16_e64 v5, v1 -// GFX11: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +v_fract_f16_e64 v5.l, v1.l +// GFX11: v_fract_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] -v_fract_f16_e64 v5, v255 -// GFX11: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] +v_fract_f16_e64 v5.l, v255.l +// GFX11: v_fract_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] -v_fract_f16_e64 v5, s1 -// GFX11: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, s1 +// GFX11: v_fract_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] -v_fract_f16_e64 v5, s105 -// GFX11: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, s105 +// GFX11: v_fract_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] -v_fract_f16_e64 v5, vcc_lo -// GFX11: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, vcc_lo +// GFX11: v_fract_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] -v_fract_f16_e64 v5, vcc_hi -// GFX11: v_fract_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, vcc_hi +// GFX11: v_fract_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] -v_fract_f16_e64 v5, ttmp15 -// GFX11: v_fract_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, ttmp15 +// GFX11: v_fract_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] -v_fract_f16_e64 v5, m0 -// GFX11: v_fract_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, m0 
+// GFX11: v_fract_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] -v_fract_f16_e64 v5, exec_lo -// GFX11: v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, exec_lo +// GFX11: v_fract_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] -v_fract_f16_e64 v5, exec_hi -// GFX11: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, exec_hi +// GFX11: v_fract_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] -v_fract_f16_e64 v5, null -// GFX11: v_fract_f16_e64 v5, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, null +// GFX11: v_fract_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] -v_fract_f16_e64 v5, -1 -// GFX11: v_fract_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, -1 +// GFX11: v_fract_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] -v_fract_f16_e64 v5, 0.5 mul:2 -// GFX11: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] +v_fract_f16_e64 v5.l, 0.5 mul:2 +// GFX11: v_fract_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] -v_fract_f16_e64 v5, src_scc mul:4 -// GFX11: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] +v_fract_f16_e64 v5.l, src_scc mul:4 +// GFX11: v_fract_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] -v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX11: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX11: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_fract_f16_e64 v5.h, v1.h +// GFX11: 
[0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00] + +v_fract_f16_e64 v5.l, v255.h +// GFX11: [0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00] + +v_fract_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX11: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_fract_f32_e64 v5, v1 // GFX11: v_fract_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index 0195c34a552e3..42b9dc464dd90 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -2168,50 +2168,62 @@ v_floor_f64 v[5:6], src_scc v_floor_f64 v[254:255], 0xaf123456 // GFX12: v_floor_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf] -v_fract_f16 v5, v1 -// GFX12: v_fract_f16_e32 v5, v1 ; encoding: [0x01,0xbf,0x0a,0x7e] +v_fract_f16 v5.l, v1.l +// GFX12: v_fract_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbf,0x0a,0x7e] -v_fract_f16 v5, v127 -// GFX12: v_fract_f16_e32 v5, v127 ; encoding: [0x7f,0xbf,0x0a,0x7e] +v_fract_f16 v5.l, v127.l +// GFX12: v_fract_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbf,0x0a,0x7e] -v_fract_f16 v5, s1 -// GFX12: v_fract_f16_e32 v5, s1 ; encoding: [0x01,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, s1 +// GFX12: v_fract_f16_e32 v5.l, s1 ; encoding: [0x01,0xbe,0x0a,0x7e] -v_fract_f16 v5, s105 -// GFX12: v_fract_f16_e32 v5, s105 ; encoding: [0x69,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, s105 +// GFX12: v_fract_f16_e32 v5.l, s105 ; encoding: [0x69,0xbe,0x0a,0x7e] -v_fract_f16 v5, vcc_lo -// GFX12: v_fract_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, vcc_lo +// GFX12: v_fract_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e] -v_fract_f16 v5, vcc_hi -// GFX12: v_fract_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, vcc_hi +// GFX12: v_fract_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e] -v_fract_f16 v5, ttmp15 -// GFX12: v_fract_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e] 
+v_fract_f16 v5.l, ttmp15 +// GFX12: v_fract_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e] -v_fract_f16 v5, m0 -// GFX12: v_fract_f16_e32 v5, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, m0 +// GFX12: v_fract_f16_e32 v5.l, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e] -v_fract_f16 v5, exec_lo -// GFX12: v_fract_f16_e32 v5, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, exec_lo +// GFX12: v_fract_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e] -v_fract_f16 v5, exec_hi -// GFX12: v_fract_f16_e32 v5, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, exec_hi +// GFX12: v_fract_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e] -v_fract_f16 v5, null -// GFX12: v_fract_f16_e32 v5, null ; encoding: [0x7c,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, null +// GFX12: v_fract_f16_e32 v5.l, null ; encoding: [0x7c,0xbe,0x0a,0x7e] -v_fract_f16 v5, -1 -// GFX12: v_fract_f16_e32 v5, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, -1 +// GFX12: v_fract_f16_e32 v5.l, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e] -v_fract_f16 v5, 0.5 -// GFX12: v_fract_f16_e32 v5, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, 0.5 +// GFX12: v_fract_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e] -v_fract_f16 v5, src_scc -// GFX12: v_fract_f16_e32 v5, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e] +v_fract_f16 v5.l, src_scc +// GFX12: v_fract_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e] -v_fract_f16 v127, 0xfe0b -// GFX12: v_fract_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_fract_f16 v127.l, 0xfe0b +// GFX12: v_fract_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_fract_f16 v5.l, v1.h +// GFX12: v_fract_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbf,0x0a,0x7e] + +v_fract_f16 v5.l, v127.h +// GFX12: v_fract_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbf,0x0a,0x7e] + +v_fract_f16 v5.h, src_scc +// GFX12: v_fract_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7f] + +v_fract_f16 
v127.h, 0xfe0b +// GFX12: v_fract_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_fract_f32 v5, v1 // GFX12: v_fract_f32_e32 v5, v1 ; encoding: [0x01,0x41,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index 072544e66e4a5..7cfc2c1d45285 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -1660,47 +1660,53 @@ v_floor_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_floor_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_floor_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_fract_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_fract_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_fract_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_fract_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_fract_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_fract_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_fract_f16 v5, v1 row_mirror -// GFX12: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_fract_f16 v5.l, v1.l row_mirror +// GFX12: v_fract_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_fract_f16 v5, v1 row_half_mirror -// GFX12: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_fract_f16 v5.l, v1.l 
row_half_mirror +// GFX12: v_fract_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_fract_f16 v5, v1 row_shl:1 -// GFX12: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_fract_f16 v5.l, v1.l row_shl:1 +// GFX12: v_fract_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_fract_f16 v5, v1 row_shl:15 -// GFX12: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_fract_f16 v5.l, v1.l row_shl:15 +// GFX12: v_fract_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_fract_f16 v5, v1 row_shr:1 -// GFX12: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_fract_f16 v5.l, v1.l row_shr:1 +// GFX12: v_fract_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_fract_f16 v5, v1 row_shr:15 -// GFX12: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_fract_f16 v5.l, v1.l row_shr:15 +// GFX12: v_fract_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_fract_f16 v5, v1 row_ror:1 -// GFX12: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_fract_f16 v5.l, v1.l row_ror:1 +// GFX12: v_fract_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_fract_f16 v5, v1 row_ror:15 -// GFX12: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_fract_f16 v5.l, v1.l row_ror:15 +// GFX12: v_fract_f16_dpp v5.l, v1.l row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_fract_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_fract_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_fract_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_fract_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_fract_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_fract_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_fract_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_fract_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_fract_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_fract_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_fract_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_fract_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_fract_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_fract_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_fract_f16 v127.h, 
-|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_fract_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_fract_f32 v5, v1 quad_perm:[3,2,1,0] // GFX12: v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index bc3559e3c65ed..ddb4029f10208 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -406,14 +406,20 @@ v_floor_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_floor_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_floor_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_fract_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_fract_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_fract_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_fract_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_fract_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_fract_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_fract_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0xea,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_fract_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_fract_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00] v_fract_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_fract_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s index 0d759baf0af0d..05a990eed89c8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s @@ -431,6 +431,12 @@ v_floor_f16_e32 v5, v199 quad_perm:[3,2,1,0] v_fract_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_fract_f16_e32 v128.h, 0xfe0b +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v128.l, 0xfe0b +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + v_fract_f16_e32 v255, v1 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -440,6 +446,24 @@ v_fract_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_fract_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction +v_fract_f16_e32 v255.h, v1.h +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.l, v1.l +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + +v_fract_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction + v_fract_f16_e32 v5, v199 // GFX12: 
:[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -449,6 +473,24 @@ v_fract_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_fract_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction +v_fract_f16_e32 v5.h, v199.h +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.l, v199.l +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_fract_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + v_frexp_exp_i16_f16_e32 v128.h, 0xfe0b // GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s index 976b6bb69c33e..96de27842c072 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s @@ -1168,71 +1168,137 @@ v_floor_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_floor_f16 v5, v199 quad_perm:[3,2,1,0] // GFX12: v_floor_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_fract_f16 v128, 0xfe0b -// GFX12: v_fract_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_fract_f16 v128.h, 0xfe0b +// GFX12: v_fract_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_fract_f16 v255, -1 -// GFX12: v_fract_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +v_fract_f16 v128.l, 
0xfe0b +// GFX12: v_fract_f16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_fract_f16 v255, 0.5 -// GFX12: v_fract_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x00] +v_fract_f16 v255.h, -1 +// GFX12: v_fract_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xc1,0x00,0x00,0x00] -v_fract_f16 v255, exec_hi -// GFX12: v_fract_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +v_fract_f16 v255.h, 0.5 +// GFX12: v_fract_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xf0,0x00,0x00,0x00] -v_fract_f16 v255, exec_lo -// GFX12: v_fract_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +v_fract_f16 v255.h, exec_hi +// GFX12: v_fract_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7f,0x00,0x00,0x00] -v_fract_f16 v255, m0 -// GFX12: v_fract_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +v_fract_f16 v255.h, exec_lo +// GFX12: v_fract_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7e,0x00,0x00,0x00] -v_fract_f16 v255, null -// GFX12: v_fract_f16_e64 v255, null ; encoding: [0xff,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +v_fract_f16 v255.h, m0 +// GFX12: v_fract_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7d,0x00,0x00,0x00] -v_fract_f16 v255, s1 -// GFX12: v_fract_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +v_fract_f16 v255.h, null +// GFX12: v_fract_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7c,0x00,0x00,0x00] -v_fract_f16 v255, s105 -// GFX12: v_fract_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +v_fract_f16 v255.h, s1 +// GFX12: v_fract_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x01,0x00,0x00,0x00] -v_fract_f16 v255, src_scc -// GFX12: v_fract_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x00] 
+v_fract_f16 v255.h, s105 +// GFX12: v_fract_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x69,0x00,0x00,0x00] -v_fract_f16 v255, ttmp15 -// GFX12: v_fract_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +v_fract_f16 v255.h, src_scc +// GFX12: v_fract_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xfd,0x00,0x00,0x00] -v_fract_f16 v255, v1 -// GFX12: v_fract_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +v_fract_f16 v255.h, ttmp15 +// GFX12: v_fract_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7b,0x00,0x00,0x00] -v_fract_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_fract_f16 v255.h, v1.h +// GFX12: v_fract_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00] -v_fract_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_fract_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_fract_f16 v255, v127 -// GFX12: v_fract_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x01,0x00,0x00] +v_fract_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_fract_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_fract_f16 v255.h, v127.h +// GFX12: v_fract_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: 
[0xff,0x48,0xdf,0xd5,0x7f,0x01,0x00,0x00] -v_fract_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_fract_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_fract_f16 v255, vcc_hi -// GFX12: v_fract_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +v_fract_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_fract_f16 v255, vcc_lo -// GFX12: v_fract_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +v_fract_f16 v255.h, vcc_hi +// GFX12: v_fract_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x6b,0x00,0x00,0x00] -v_fract_f16 v5, v199 -// GFX12: v_fract_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xdf,0xd5,0xc7,0x01,0x00,0x00] +v_fract_f16 v255.h, vcc_lo +// GFX12: v_fract_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x6a,0x00,0x00,0x00] -v_fract_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_fract_f16 v255.l, -1 +// GFX12: v_fract_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] -v_fract_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_fract_f16 v255.l, 0.5 +// GFX12: v_fract_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x00] + +v_fract_f16 v255.l, exec_hi +// GFX12: v_fract_f16_e64 v255.l, 
exec_hi ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] + +v_fract_f16 v255.l, exec_lo +// GFX12: v_fract_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] + +v_fract_f16 v255.l, m0 +// GFX12: v_fract_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] + +v_fract_f16 v255.l, null +// GFX12: v_fract_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] + +v_fract_f16 v255.l, s1 +// GFX12: v_fract_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] + +v_fract_f16 v255.l, s105 +// GFX12: v_fract_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] + +v_fract_f16 v255.l, src_scc +// GFX12: v_fract_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x00] + +v_fract_f16 v255.l, ttmp15 +// GFX12: v_fract_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] + +v_fract_f16 v255.l, v1.l +// GFX12: v_fract_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] + +v_fract_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_fract_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_fract_f16 v255.l, v127.l +// GFX12: v_fract_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x01,0x00,0x00] + +v_fract_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_fract_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + 
+v_fract_f16 v255.l, vcc_hi +// GFX12: v_fract_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] + +v_fract_f16 v255.l, vcc_lo +// GFX12: v_fract_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] + +v_fract_f16 v5.h, v199.h +// GFX12: v_fract_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0xc7,0x01,0x00,0x00] + +v_fract_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_fract_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_fract_f16 v5.l, v199.l +// GFX12: v_fract_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xdf,0xd5,0xc7,0x01,0x00,0x00] + +v_fract_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_fract_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_frexp_exp_i16_f16 v128.h, 0xfe0b // GFX12: v_frexp_exp_i16_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xda,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index e4f62eadc0e49..613a70f46800e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -2236,50 +2236,59 @@ v_floor_f64_e64 v[5:6], -|src_scc| mul:4 v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 // GFX12: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: 
[0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_fract_f16_e64 v5, v1 -// GFX12: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +v_fract_f16_e64 v5.l, v1.l +// GFX12: v_fract_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] -v_fract_f16_e64 v5, v255 -// GFX12: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] +v_fract_f16_e64 v5.l, v255.l +// GFX12: v_fract_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] -v_fract_f16_e64 v5, s1 -// GFX12: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, s1 +// GFX12: v_fract_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] -v_fract_f16_e64 v5, s105 -// GFX12: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, s105 +// GFX12: v_fract_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] -v_fract_f16_e64 v5, vcc_lo -// GFX12: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, vcc_lo +// GFX12: v_fract_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] -v_fract_f16_e64 v5, vcc_hi -// GFX12: v_fract_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, vcc_hi +// GFX12: v_fract_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] -v_fract_f16_e64 v5, ttmp15 -// GFX12: v_fract_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, ttmp15 +// GFX12: v_fract_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] -v_fract_f16_e64 v5, m0 -// GFX12: v_fract_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, m0 +// GFX12: v_fract_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] -v_fract_f16_e64 v5, exec_lo -// GFX12: 
v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, exec_lo +// GFX12: v_fract_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] -v_fract_f16_e64 v5, exec_hi -// GFX12: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, exec_hi +// GFX12: v_fract_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] -v_fract_f16_e64 v5, null -// GFX12: v_fract_f16_e64 v5, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, null +// GFX12: v_fract_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] -v_fract_f16_e64 v5, -1 -// GFX12: v_fract_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, -1 +// GFX12: v_fract_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] -v_fract_f16_e64 v5, 0.5 mul:2 -// GFX12: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] +v_fract_f16_e64 v5.l, 0.5 mul:2 +// GFX12: v_fract_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] -v_fract_f16_e64 v5, src_scc mul:4 -// GFX12: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] +v_fract_f16_e64 v5.l, src_scc mul:4 +// GFX12: v_fract_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] -v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX12: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX12: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_fract_f16_e64 v5.h, v1.h +// GFX12: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00] + +v_fract_f16_e64 v5.l, v255.h +// GFX12: v_fract_f16_e64 v5.l, 
v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00] + +v_fract_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX12: v_fract_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_fract_f32_e64 v5, v1 // GFX12: v_fract_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index fb57e5cd54ab8..2044058566052 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -1699,47 +1699,56 @@ v_floor_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_fract_f16_e64_dpp v5, v1 row_mirror -// GFX12: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_mirror +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_half_mirror -// GFX12: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_shl:1 -// GFX12: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_shl:15 -// GFX12: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_shr:1 -// GFX12: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_shr:15 -// GFX12: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_ror:1 -// GFX12: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_ror:15 -// GFX12: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 
row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_fract_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_fract_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_fract_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: v_fract_f32_e64_dpp v5, v1 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index acb73d8dbaf73..b0283c2df7169 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -502,17 +502,26 @@ v_floor_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xa4,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// 
GFX12: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_fract_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_fract_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_fract_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt index 55b2081c04917..67b39ee2fdea0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt @@ -2017,49 +2017,82 @@ # GFX11: v_floor_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf] 0x01,0xbf,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, v1 ; encoding: [0x01,0xbf,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbf,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, v1 ; encoding: [0x01,0xbf,0x0a,0x7e] 0x7f,0xbf,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, v127 ; encoding: 
[0x7f,0xbf,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbf,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, v127 ; encoding: [0x7f,0xbf,0x0a,0x7e] 0x01,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, s1 ; encoding: [0x01,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, s1 ; encoding: [0x01,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, s1 ; encoding: [0x01,0xbe,0x0a,0x7e] 0x69,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, s105 ; encoding: [0x69,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, s105 ; encoding: [0x69,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, s105 ; encoding: [0x69,0xbe,0x0a,0x7e] 0x6a,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e] 0x6b,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e] 0x7b,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e] 0x7d,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e] 0x7e,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e] 0x7f,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e] +# 
GFX11-REAL16: v_fract_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e] 0x7c,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, null ; encoding: [0x7c,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, null ; encoding: [0x7c,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, null ; encoding: [0x7c,0xbe,0x0a,0x7e] 0xc1,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e] 0xf0,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e] 0xfd,0xbe,0x0a,0x7e -# GFX11: v_fract_f16_e32 v5, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e] +# GFX11-REAL16: v_fract_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e] 0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00 -# GFX11: v_fract_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +0x81,0xbf,0x0a,0x7e +# GFX11-REAL16: v_fract_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbf,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xbf,0x0a,0x7e] + +0xff,0xbf,0x0a,0x7e +# GFX11-REAL16: v_fract_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbf,0x0a,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xbf,0x0a,0x7e] + +0xf0,0xbe,0xfe,0x7e +# GFX11-REAL16: v_fract_f16_e32 
v127.l, 0.5 ; encoding: [0xf0,0xbe,0xfe,0x7e] +# GFX11-FAKE16: v_fract_f16_e32 v127, 0.5 ; encoding: [0xf0,0xbe,0xfe,0x7e] + +0xfd,0xbe,0x0a,0x7f +# GFX11-REAL16: v_fract_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7f] + +0xff,0xbe,0xfe,0x7f,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_fract_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7f,0x0b,0xfe,0x00,0x00] 0x01,0x41,0x0a,0x7e # GFX11: v_fract_f32_e32 v5, v1 ; encoding: [0x01,0x41,0x0a,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index d2e1e926cc19e..55a128f386b7c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -1545,46 +1545,72 @@ # GFX11: v_floor_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX11: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l 
row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX11: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX11: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-REAL16: v_fract_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xbe,0xfe,0x7e,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_fract_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_fract_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_fract_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX11-REAL16: v_fract_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX11-FAKE16: 
v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX11: v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index 93fb5e2b4c01a..0a4d263862407 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -302,10 +302,23 @@ # GFX11: v_floor_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX11: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xbe,0xfe,0x7e,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_fract_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_fract_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +0xe9,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_fract_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187 ; 
encoding: [0x81,0x77,0x39,0x05] + +0xea,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_fract_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX11: v_fract_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index 74d875081d113..4e64ecaa85ecc 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -1659,46 +1659,72 @@ # GFX11: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# 
GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX11: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX11: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# 
GFX11: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX11: v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index a4bdfe9f4a975..1d9edc8535d60 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -455,16 +455,32 @@ # GFX11: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX11: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX11: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX11: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index 7c4f1634026fd..c3889208779f8 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -2037,49 +2037,76 @@ # GFX11: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] 0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +# 
GFX11-FAKE16: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, 
null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00 -# GFX11: v_fract_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08 -# GFX11: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-REAL16: v_fract_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-FAKE16: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10 -# GFX11: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-REAL16: v_fract_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-FAKE16: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX11: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00 +# GFX11-REAL16: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00 +# GFX11-REAL16: v_fract_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v5, v255 ; encoding: 
[0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_fract_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00 # GFX11: v_fract_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index 24dc882e8beb0..b9e8c46a084f4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -1649,46 +1649,68 @@ # GFX12: v_floor_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 
row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# 
GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX12: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_fract_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_fract_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX12-REAL16: v_fract_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX12: v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 2eeb220b913fd..66cdd104850fd 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -315,10 +315,19 @@ # GFX12: v_floor_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX12: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_fract_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_fract_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_fract_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 661d072f46c1a..6c98db919a9d7 
100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -2089,50 +2089,78 @@ # GFX12: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] 0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, vcc_hi ; 
encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64 v5, -1 ; encoding: 
[0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_fract_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_fract_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX12: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00 +# GFX12-REAL16: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00 +# GFX12-REAL16: v_fract_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_fract_f16_e64 v255.h, -|0xfe0b| 
op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +# GFX11: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_fract_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index a1e431bc49d34..829e21f9b4b99 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -1665,47 +1665,74 @@ # GFX12: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp 
v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +# GFX11: 
v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 405b716c110e1..c22c8745d86c2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -473,17 +473,34 @@ # GFX12: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_fract_f16_e64_dpp v5, v1 mul:4 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp 
div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +# GFX11: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From b71a6fd042173098977e97a47ee0bedb4040069a Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 15:46:06 -0500 Subject: [PATCH 123/480] [AMDGPU][True16][MC] true16 for v_cvt_i32_i16 (#120645) Support true16 format for v_cvt_i32_i16 in MC --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 14 ++-- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 65 +++++++++++-------- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 21 ++++-- llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s | 18 +++++ .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s | 21 ++++-- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 59 +++++++++-------- .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 15 +++-- .../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s | 11 ++-- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 14 ++-- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 62 ++++++++++-------- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 18 +++-- llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s | 18 +++++ .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s | 21 ++++-- .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 11 ++-- .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 59 +++++++++-------- .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 15 +++-- .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 15 ++++- .../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 54 +++++++++++---- .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 18 ++++- .../gfx11_dasm_vop3_dpp16_from_vop1.txt | 46 +++++++++---- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 10 ++- .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt | 
10 ++- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 50 ++++++++++---- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 14 +++- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 10 ++- .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 46 +++++++++---- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 10 ++- 28 files changed, 492 insertions(+), 235 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index b58b7a5fcdcd0..e1d97bd9f6399 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1019,7 +1019,7 @@ defm V_SWAP_B16 : VOP1Only_Real_gfx11_gfx12<0x066>; defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12<0x067>; defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x01c, "v_mov_b16">; defm V_NOT_B16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x069, "v_not_b16">; -defm V_CVT_I32_I16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">; +defm V_CVT_I32_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">; defm V_CVT_U32_U16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06b, "v_cvt_u32_u16">; defm V_CVT_F16_U16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x050, "v_cvt_f16_u16">; diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index 0d29ed985269a..ace776f789eba 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -1271,11 +1271,11 @@ v_cvt_i32_f64 v5, src_scc v_cvt_i32_f64 v255, 0xaf123456 // GFX11: v_cvt_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_cvt_i32_i16 v5, v1 -// GFX11: v_cvt_i32_i16_e32 v5, v1 ; encoding: [0x01,0xd5,0x0a,0x7e] +v_cvt_i32_i16 v5, v1.l +// GFX11: v_cvt_i32_i16_e32 v5, v1.l ; encoding: [0x01,0xd5,0x0a,0x7e] -v_cvt_i32_i16 v5, v127 -// GFX11: v_cvt_i32_i16_e32 v5, v127 ; encoding: [0x7f,0xd5,0x0a,0x7e] +v_cvt_i32_i16 v5, v127.l +// GFX11: v_cvt_i32_i16_e32 v5, v127.l ; encoding: [0x7f,0xd5,0x0a,0x7e] v_cvt_i32_i16 v5, 
s1 // GFX11: v_cvt_i32_i16_e32 v5, s1 ; encoding: [0x01,0xd4,0x0a,0x7e] @@ -1316,6 +1316,12 @@ v_cvt_i32_i16 v5, src_scc v_cvt_i32_i16 v255, 0xfe0b // GFX11: v_cvt_i32_i16_e32 v255, 0xfe0b ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] +v_cvt_i32_i16 v5, v1.h +// GFX11: v_cvt_i32_i16_e32 v5, v1.h ; encoding: [0x81,0xd5,0x0a,0x7e] + +v_cvt_i32_i16 v5, v127.h +// GFX11: v_cvt_i32_i16_e32 v5, v127.h ; encoding: [0xff,0xd5,0x0a,0x7e] + v_cvt_nearest_i32_f32 v5, v1 // GFX11: v_cvt_nearest_i32_f32_e32 v5, v1 ; encoding: [0x01,0x19,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index d4fb880f25b55..93c120ac59477 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ -926,47 +926,56 @@ v_cvt_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cvt_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cvt_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_cvt_i32_i16 v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_cvt_i32_i16 v5, v1.l quad_perm:[3,2,1,0] +// GFX11: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_cvt_i32_i16 v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_cvt_i32_i16 v5, v1.l quad_perm:[0,1,2,3] +// GFX11: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_cvt_i32_i16 v5, v1 row_mirror -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_mirror +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_half_mirror -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_half_mirror +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_shl:1 -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_shl:1 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_shl:15 -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_shl:15 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_shr:1 -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_shr:1 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_shr:15 -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_shr:15 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_ror:1 -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_ror:1 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_ror:15 -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_ror:15 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_cvt_i32_i16 v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_cvt_i32_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_cvt_i32_i16 v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_cvt_i32_i16 v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30] +v_cvt_i32_i16 v255, v127.l 
row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cvt_i32_i16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30] + +v_cvt_i32_i16 v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cvt_i32_i16_dpp v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x7f,0x5f,0x01,0x01] + +v_cvt_i32_i16 v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cvt_i32_i16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x09,0x13] + +v_cvt_i32_i16 v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cvt_i32_i16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x05,0x30] v_cvt_nearest_i32_f32 v5, v1 quad_perm:[3,2,1,0] // GFX11: v_cvt_nearest_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index b6094c5ea3bd6..2029baee77df9 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -218,14 +218,23 @@ v_cvt_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cvt_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cvt_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_cvt_i32_i16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_cvt_i32_i16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_cvt_i32_i16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] +v_cvt_i32_i16 v255, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cvt_i32_i16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] + +v_cvt_i32_i16 v5, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cvt_i32_i16_dpp v5, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x7f,0x77,0x39,0x05] + +v_cvt_i32_i16 v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cvt_i32_i16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05] + +v_cvt_i32_i16 v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cvt_i32_i16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00] v_cvt_nearest_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cvt_nearest_i32_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index 98db7cc8bbc40..936cce46f2ebc 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -236,6 +236,24 @@ v_cvt_i32_i16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_cvt_i32_i16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction +v_cvt_i32_i16_e32 v5.h, v199.h +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cvt_i32_i16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cvt_i32_i16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid 
operand for instruction + +v_cvt_i32_i16_e32 v5.l, v199.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cvt_i32_i16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cvt_i32_i16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + v_cvt_norm_i16_f16_e32 v128.h, 0xfe0b // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index 9de05d4a82465..1c8d7e43be081 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -662,14 +662,23 @@ v_cvt_i16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] v_cvt_i16_f16 v5.l, v199.l quad_perm:[3,2,1,0] // GFX11: v_cvt_i16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd3,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_cvt_i32_i16 v5, v199 -// GFX11: v_cvt_i32_i16_e64 v5, v199 ; encoding: [0x05,0x00,0xea,0xd5,0xc7,0x01,0x00,0x00] +v_cvt_i32_i16 v5, v199.h +// GFX11: v_cvt_i32_i16_e64 v5, v199.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xc7,0x01,0x00,0x00] -v_cvt_i32_i16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cvt_i32_i16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_cvt_i32_i16 v5, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cvt_i32_i16_e64_dpp v5, v199.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] -v_cvt_i32_i16 v5, v199 quad_perm:[3,2,1,0] -// GFX11: v_cvt_i32_i16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_cvt_i32_i16 v5, v199.h quad_perm:[3,2,1,0] +// GFX11: v_cvt_i32_i16_e64_dpp v5, v199.h op_sel:[1,0] quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_cvt_i32_i16 v5, v199.l +// GFX11: v_cvt_i32_i16_e64 v5, v199.l ; encoding: [0x05,0x00,0xea,0xd5,0xc7,0x01,0x00,0x00] + +v_cvt_i32_i16 v5, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cvt_i32_i16_e64_dpp v5, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_cvt_i32_i16 v5, v199.l quad_perm:[3,2,1,0] +// GFX11: v_cvt_i32_i16_e64_dpp v5, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_cvt_norm_i16_f16 v128.h, 0xfe0b // GFX11: v_cvt_norm_i16_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe3,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index b674395fddf63..204d87c280525 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -967,47 +967,50 @@ v_cvt_i32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30] -v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_mirror -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cvt_i32_i16_e64_dpp 
v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] -v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] +v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] -v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] + +v_cvt_i32_i16_e64_dpp v255, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index a07db726574e5..d779b65bc0ba9 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ 
b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -283,14 +283,17 @@ v_cvt_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x88,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00] -v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +v_cvt_i32_i16_e64_dpp v255, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0x08,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s 
b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 964a19205df5c..7abc0185d6af6 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -1267,11 +1267,11 @@ v_cvt_i32_f64_e64 v5, -|src_scc| v_cvt_i32_f64_e64 v255, 0xaf123456 clamp // GFX11: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cvt_i32_i16_e64 v5, v1 -// GFX11: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] +v_cvt_i32_i16_e64 v5, v1.l +// GFX11: v_cvt_i32_i16_e64 v5, v1.l ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] -v_cvt_i32_i16_e64 v5, v255 -// GFX11: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] +v_cvt_i32_i16_e64 v5, v255.l +// GFX11: v_cvt_i32_i16_e64 v5, v255.l ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] v_cvt_i32_i16_e64 v5, s1 // GFX11: v_cvt_i32_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00] @@ -1312,6 +1312,9 @@ v_cvt_i32_i16_e64 v5, src_scc v_cvt_i32_i16_e64 v255, 0xfe0b // GFX11: v_cvt_i32_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cvt_i32_i16_e64 v5, v255.h +// GFX11: [0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00] + v_cvt_nearest_i32_f32_e64 v5, v1 // GFX11: v_cvt_nearest_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index 42b9dc464dd90..8f517ecdfc84a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -1338,11 +1338,11 @@ v_cvt_i32_f64 v5, src_scc v_cvt_i32_f64 v255, 0xaf123456 // GFX12: v_cvt_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_cvt_i32_i16 v5, v1 -// GFX12: v_cvt_i32_i16_e32 v5, v1 ; encoding: [0x01,0xd5,0x0a,0x7e] +v_cvt_i32_i16 v5, v1.l +// GFX12: v_cvt_i32_i16_e32 v5, v1.l ; encoding: 
[0x01,0xd5,0x0a,0x7e] -v_cvt_i32_i16 v5, v127 -// GFX12: v_cvt_i32_i16_e32 v5, v127 ; encoding: [0x7f,0xd5,0x0a,0x7e] +v_cvt_i32_i16 v5, v127.l +// GFX12: v_cvt_i32_i16_e32 v5, v127.l ; encoding: [0x7f,0xd5,0x0a,0x7e] v_cvt_i32_i16 v5, s1 // GFX12: v_cvt_i32_i16_e32 v5, s1 ; encoding: [0x01,0xd4,0x0a,0x7e] @@ -1384,6 +1384,12 @@ v_cvt_i32_i16 v5, src_scc v_cvt_i32_i16 v255, 0xfe0b // GFX12: v_cvt_i32_i16_e32 v255, 0xfe0b ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] +v_cvt_i32_i16 v5, v1.h +// GFX12: v_cvt_i32_i16_e32 v5, v1.h ; encoding: [0x81,0xd5,0x0a,0x7e] + +v_cvt_i32_i16 v5, v127.h +// GFX12: v_cvt_i32_i16_e32 v5, v127.h ; encoding: [0xff,0xd5,0x0a,0x7e] + v_cvt_nearest_i32_f32 v5, v1 // GFX12: v_cvt_nearest_i32_f32_e32 v5, v1 ; encoding: [0x01,0x19,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index 7cfc2c1d45285..914cfcbb229a3 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -970,47 +970,53 @@ v_cvt_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cvt_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cvt_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x35,0x30] -v_cvt_i32_i16 v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_cvt_i32_i16 v5, v1.l quad_perm:[3,2,1,0] +// GFX12: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_cvt_i32_i16 v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_cvt_i32_i16 v5, v1.l quad_perm:[0,1,2,3] +// GFX12: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[0,1,2,3] 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_cvt_i32_i16 v5, v1 row_mirror -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_mirror +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_half_mirror -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_half_mirror +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_shl:1 -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_shl:1 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_shl:15 -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_shl:15 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_shr:1 -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_shr:1 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_shr:15 -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_shr:15 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_ror:1 -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_ror:1 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_ror:15 -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_ror:15 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_cvt_i32_i16 v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_cvt_i32_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_cvt_i32_i16 v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_cvt_i32_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_cvt_i32_i16 v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_cvt_i32_i16 v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 
fi:1 -// GFX12: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30] +v_cvt_i32_i16 v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cvt_i32_i16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30] + +v_cvt_i32_i16 v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cvt_i32_i16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x09,0x13] + +v_cvt_i32_i16 v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cvt_i32_i16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x05,0x30] v_cvt_nearest_i32_f32 v5, v1 quad_perm:[3,2,1,0] // GFX12: v_cvt_nearest_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index ddb4029f10208..f1c4e863b1873 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -244,14 +244,20 @@ v_cvt_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cvt_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cvt_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_cvt_i32_i16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_cvt_i32_i16 
v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_cvt_i32_i16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] +v_cvt_i32_i16 v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cvt_i32_i16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] + +v_cvt_i32_i16 v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cvt_i32_i16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05] + +v_cvt_i32_i16 v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cvt_i32_i16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00] v_cvt_nearest_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cvt_nearest_i32_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s index 05a990eed89c8..eb7b86635f35d 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s @@ -251,6 +251,24 @@ v_cvt_i32_i16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_cvt_i32_i16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction +v_cvt_i32_i16_e32 v5, v199.h +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_cvt_i32_i16_e32 v5, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_cvt_i32_i16_e32 v5, v199.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_cvt_i32_i16_e32 v5, v199.l +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + +v_cvt_i32_i16_e32 v5, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid 
operand for instruction + +v_cvt_i32_i16_e32 v5, v199.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction + v_cvt_norm_i16_f16_e32 v128.h, 0xfe0b // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s index 96de27842c072..2f0c0a1192f2f 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s @@ -622,14 +622,23 @@ v_cvt_i16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] v_cvt_i16_f16 v5.l, v199.l quad_perm:[3,2,1,0] // GFX12: v_cvt_i16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd3,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_cvt_i32_i16 v5, v199 -// GFX12: v_cvt_i32_i16_e64 v5, v199 ; encoding: [0x05,0x00,0xea,0xd5,0xc7,0x01,0x00,0x00] +v_cvt_i32_i16 v5, v199.h +// GFX12: v_cvt_i32_i16_e64 v5, v199.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xc7,0x01,0x00,0x00] -v_cvt_i32_i16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cvt_i32_i16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_cvt_i32_i16 v5, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cvt_i32_i16_e64_dpp v5, v199.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] -v_cvt_i32_i16 v5, v199 quad_perm:[3,2,1,0] -// GFX12: v_cvt_i32_i16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_cvt_i32_i16 v5, v199.h quad_perm:[3,2,1,0] +// GFX12: v_cvt_i32_i16_e64_dpp v5, v199.h op_sel:[1,0] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_cvt_i32_i16 v5, v199.l +// GFX12: v_cvt_i32_i16_e64 v5, v199.l ; encoding: [0x05,0x00,0xea,0xd5,0xc7,0x01,0x00,0x00] + 
+v_cvt_i32_i16 v5, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cvt_i32_i16_e64_dpp v5, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_cvt_i32_i16 v5, v199.l quad_perm:[3,2,1,0] +// GFX12: v_cvt_i32_i16_e64_dpp v5, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_cvt_norm_i16_f16 v128.h, 0xfe0b // GFX12: v_cvt_norm_i16_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe3,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index 613a70f46800e..224f7f090a64f 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -1417,11 +1417,11 @@ v_cvt_i32_f64_e64 v5, -|src_scc| v_cvt_i32_f64_e64 v255, 0xaf123456 clamp // GFX12: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cvt_i32_i16_e64 v5, v1 -// GFX12: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] +v_cvt_i32_i16_e64 v5, v1.l +// GFX12: v_cvt_i32_i16_e64 v5, v1.l ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] -v_cvt_i32_i16_e64 v5, v255 -// GFX12: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] +v_cvt_i32_i16_e64 v5, v255.l +// GFX12: v_cvt_i32_i16_e64 v5, v255.l ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] v_cvt_i32_i16_e64 v5, s1 // GFX12: v_cvt_i32_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00] @@ -1462,6 +1462,9 @@ v_cvt_i32_i16_e64 v5, src_scc v_cvt_i32_i16_e64 v255, 0xfe0b // GFX12: v_cvt_i32_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cvt_i32_i16_e64 v5, v255.h +// GFX12: v_cvt_i32_i16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00] + 
v_cvt_nearest_i32_f32_e64 v5, v1 // GFX12: v_cvt_nearest_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 2044058566052..0a8ce42e130c3 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -1000,47 +1000,50 @@ v_cvt_i32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30] -v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_mirror -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] -v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] 
+v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13] -v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] + +v_cvt_i32_i16_e64_dpp v255, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index b0283c2df7169..930f8f8d56957 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -316,14 +316,17 @@ v_cvt_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x88,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00] -v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +v_cvt_i32_i16_e64_dpp v255, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x08,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt index 67b39ee2fdea0..cc3b8fdd9093b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt @@ -1300,10 +1300,12 @@ # GFX11: v_cvt_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf] 0x01,0xd5,0x0a,0x7e -# GFX11: v_cvt_i32_i16_e32 v5, v1 ; encoding: [0x01,0xd5,0x0a,0x7e] +# 
GFX11-REAL16: v_cvt_i32_i16_e32 v5, v1.l ; encoding: [0x01,0xd5,0x0a,0x7e] +# GFX11-FAKE16: v_cvt_i32_i16_e32 v5, v1 ; encoding: [0x01,0xd5,0x0a,0x7e] 0x7f,0xd5,0x0a,0x7e -# GFX11: v_cvt_i32_i16_e32 v5, v127 ; encoding: [0x7f,0xd5,0x0a,0x7e] +# GFX11-REAL16: v_cvt_i32_i16_e32 v5, v127.l ; encoding: [0x7f,0xd5,0x0a,0x7e] +# GFX11-FAKE16: v_cvt_i32_i16_e32 v5, v127 ; encoding: [0x7f,0xd5,0x0a,0x7e] 0x01,0xd4,0x0a,0x7e # GFX11: v_cvt_i32_i16_e32 v5, s1 ; encoding: [0x01,0xd4,0x0a,0x7e] @@ -1344,6 +1346,15 @@ 0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00 # GFX11: v_cvt_i32_i16_e32 v255, 0xfe0b ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] +0x81,0xd5,0x0a,0x7e +# GFX11-REAL16: v_cvt_i32_i16_e32 v5, v1.h ; encoding: [0x81,0xd5,0x0a,0x7e] +# GFX11-FAKE16: v_cvt_i32_i16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xd5,0x0a,0x7e] + +0xff,0xd5,0x0a,0x7e +# GFX11-REAL16: v_cvt_i32_i16_e32 v5, v127.h ; encoding: [0xff,0xd5,0x0a,0x7e] +# GFX11-FAKE16: v_cvt_i32_i16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xd5,0x0a,0x7e] + + 0x01,0x19,0x0a,0x7e # GFX11: v_cvt_nearest_i32_f32_e32 v5, v1 ; encoding: [0x01,0x19,0x0a,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index 55a128f386b7c..ba9e8142942de 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -971,46 +971,72 @@ # GFX11: v_cvt_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] 
0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX11: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30 -# GFX11: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cvt_i32_i16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30] + +0xfa,0xd4,0x0a,0x7e,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x7f,0x5f,0x01,0x01] + +0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13] + +0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cvt_i32_i16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX11: v_cvt_nearest_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index 0a4d263862407..dda9dfcb35b1a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -185,10 +185,24 @@ # GFX11: v_cvt_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00 -# GFX11: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cvt_i32_i16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] + +0xe9,0xd4,0x0a,0x7e,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x7f,0x77,0x39,0x05] + +0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05] + +0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cvt_i32_i16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cvt_i32_i16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX11: v_cvt_nearest_i32_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index 4e64ecaa85ecc..0191f37c14e31 100644 --- 
a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -1025,46 +1025,64 @@ # GFX11: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 
0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: 
v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] 0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] + +0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# 
GFX11-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index 1d9edc8535d60..ab3788deeed3d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -291,10 +291,16 @@ # GFX11: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00] 0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: 
v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +0xff,0x08,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index c3889208779f8..2e741322eb122 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -1314,10 +1314,12 @@ # GFX11: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00 -# GFX11: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] +# GFX11-REAL16: v_cvt_i32_i16_e64 v5, v1.l ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00 -# GFX11: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] +# GFX11-REAL16: v_cvt_i32_i16_e64 v5, v255.l ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00 # GFX11: v_cvt_i32_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00] @@ -1358,6 +1360,10 @@ 
0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_cvt_i32_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00 +# GFX11-REAL16: v_cvt_i32_i16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] + 0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00 # GFX11: v_cvt_nearest_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index b9e8c46a084f4..4d6e8ffbd9a27 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -1035,46 +1035,68 @@ # GFX12: v_cvt_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff] 
0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: 
v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30 -# GFX12: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cvt_i32_i16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30] + +0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13] + +0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cvt_i32_i16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v255, v255/*Invalid register, operand 
has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX12: v_cvt_nearest_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 66cdd104850fd..fcc1d3f97dcb1 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -198,10 +198,20 @@ # GFX12: v_cvt_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00 -# GFX12: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cvt_i32_i16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00] + +0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05] + +0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12-REAL16: 
v_cvt_i32_i16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cvt_i32_i16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_cvt_nearest_i32_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 6c98db919a9d7..dad6b502e0bd0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -1366,10 +1366,12 @@ # GFX12: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_cvt_i32_i16_e64 v5, v1.l ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_cvt_i32_i16_e64 v5, v255.l ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00 # GFX12: v_cvt_i32_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00] @@ -1410,6 +1412,10 @@ 0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cvt_i32_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00 +# GFX12-REAL16: v_cvt_i32_i16_e64 v5, v255.h 
op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] + 0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_cvt_nearest_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index 829e21f9b4b99..ccf5f4b21b73c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -1055,46 +1055,64 @@ # GFX12: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 
0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: 
v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01] 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13] 0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] + +0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index c22c8745d86c2..8018f80798573 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -321,10 +321,16 @@ # GFX12: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00] 0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] + +0xff,0x08,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From bf274b3d8044cab8478bef50ccf96313e4dbf21e Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 3 Jan 2025 15:46:41 -0500 Subject: [PATCH 124/480] [AMDGPU][True16][MC] true16 for v_cos_f16 (#120639) Support true16 format for v_cos_f16 in MC --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll | 32 ++++ llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 75 +++++---- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 65 ++++---- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 21 ++- llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s | 42 +++++ .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s | 154 +++++++++++++----- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s | 65 ++++---- .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s | 25 ++- 
.../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s | 69 ++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 72 ++++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 62 +++---- llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 18 +- llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s | 42 +++++ .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s | 154 +++++++++++++----- .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 69 ++++---- .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 65 ++++---- .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 25 ++- .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 63 +++++-- .../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 54 ++++-- .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 17 +- .../gfx11_dasm_vop3_dpp16_from_vop1.txt | 54 ++++-- .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt | 24 ++- .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt | 57 +++++-- .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 50 ++++-- .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 13 +- .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 57 +++++-- .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 54 ++++-- .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 24 ++- 29 files changed, 1062 insertions(+), 462 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index e1d97bd9f6399..fc22b539d7153 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1046,7 +1046,7 @@ defm V_TRUNC_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05d, defm V_RNDNE_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">; defm V_FRACT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05f, "v_fract_f16">; defm V_SIN_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x060, "v_sin_f16">; -defm V_COS_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">; +defm V_COS_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x061, "v_cos_f16">; defm V_SAT_PK_U8_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">; defm V_CVT_NORM_I16_F16 : 
VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x063, "v_cvt_norm_i16_f16">; defm V_CVT_NORM_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x064, "v_cvt_norm_u16_f16">; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll index 3ff759a5cdb94..867025adca944 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -4,6 +4,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: cos_f16: @@ -80,6 +81,19 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX11-NEXT: v_cos_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: cos_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cos_f16_e32 v1, v1 +; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %a.val = load half, ptr addrspace(1) %a %r.val = call half @llvm.cos.f16(half %a.val) store half %r.val, ptr addrspace(1) %r @@ -188,6 +202,24 @@ define amdgpu_kernel void @cos_v2f16(ptr 
addrspace(1) %r, ptr addrspace(1) %a) { ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: cos_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 +; GFX12-NEXT: v_cos_f16_e32 v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-NEXT: v_cos_f16_e32 v2, v2 +; GFX12-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %a.val = load <2 x half>, ptr addrspace(1) %a %r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val) store <2 x half> %r.val, ptr addrspace(1) %r diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index ace776f789eba..40a6e434b438d 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -269,50 +269,65 @@ v_clz_i32_u32 v5, src_scc v_clz_i32_u32 v255, 0xaf123456 // GFX11: v_clz_i32_u32_e32 v255, 0xaf123456 ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_cos_f16 v5, v1 -// GFX11: v_cos_f16_e32 v5, v1 ; encoding: [0x01,0xc3,0x0a,0x7e] +v_cos_f16 v5.l, v1.l +// GFX11: v_cos_f16_e32 v5.l, v1.l ; encoding: [0x01,0xc3,0x0a,0x7e] -v_cos_f16 v5, v127 -// GFX11: v_cos_f16_e32 v5, v127 ; encoding: [0x7f,0xc3,0x0a,0x7e] +v_cos_f16 v5.l, v127.l +// GFX11: v_cos_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc3,0x0a,0x7e] -v_cos_f16 v5, s1 -// GFX11: v_cos_f16_e32 v5, s1 ; encoding: [0x01,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, s1 +// GFX11: v_cos_f16_e32 v5.l, s1 ; encoding: [0x01,0xc2,0x0a,0x7e] -v_cos_f16 v5, s105 -// 
GFX11: v_cos_f16_e32 v5, s105 ; encoding: [0x69,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, s105 +// GFX11: v_cos_f16_e32 v5.l, s105 ; encoding: [0x69,0xc2,0x0a,0x7e] -v_cos_f16 v5, vcc_lo -// GFX11: v_cos_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, vcc_lo +// GFX11: v_cos_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e] -v_cos_f16 v5, vcc_hi -// GFX11: v_cos_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, vcc_hi +// GFX11: v_cos_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e] -v_cos_f16 v5, ttmp15 -// GFX11: v_cos_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, ttmp15 +// GFX11: v_cos_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e] -v_cos_f16 v5, m0 -// GFX11: v_cos_f16_e32 v5, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, m0 +// GFX11: v_cos_f16_e32 v5.l, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e] -v_cos_f16 v5, exec_lo -// GFX11: v_cos_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, exec_lo +// GFX11: v_cos_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e] -v_cos_f16 v5, exec_hi -// GFX11: v_cos_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, exec_hi +// GFX11: v_cos_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e] -v_cos_f16 v5, null -// GFX11: v_cos_f16_e32 v5, null ; encoding: [0x7c,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, null +// GFX11: v_cos_f16_e32 v5.l, null ; encoding: [0x7c,0xc2,0x0a,0x7e] -v_cos_f16 v5, -1 -// GFX11: v_cos_f16_e32 v5, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, -1 +// GFX11: v_cos_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e] -v_cos_f16 v5, 0.5 -// GFX11: v_cos_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, 0.5 +// GFX11: v_cos_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e] -v_cos_f16 v5, src_scc -// GFX11: v_cos_f16_e32 v5, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, src_scc +// GFX11: v_cos_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e] 
-v_cos_f16 v127, 0xfe0b -// GFX11: v_cos_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_cos_f16 v127.l, 0xfe0b +// GFX11: v_cos_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_cos_f16 v5.l, v1.h +// GFX11: v_cos_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc3,0x0a,0x7e] + +v_cos_f16 v5.l, v127.h +// GFX11: v_cos_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc3,0x0a,0x7e] + +v_cos_f16 v127.l, 0.5 +// GFX11: v_cos_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc2,0xfe,0x7e] + +v_cos_f16 v5.h, src_scc +// GFX11: v_cos_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7f] + +v_cos_f16 v127.h, 0xfe0b +// GFX11: v_cos_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_cos_f32 v5, v1 // GFX11: v_cos_f32_e32 v5, v1 ; encoding: [0x01,0x6d,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index 93c120ac59477..706cb6e32f88a 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ -212,47 +212,56 @@ v_clz_i32_u32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_clz_i32_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_clz_i32_u32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x05,0x30] -v_cos_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_cos_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_cos_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_cos_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_cos_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_cos_f16_dpp v5.l, v1.l 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_cos_f16 v5, v1 row_mirror -// GFX11: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_cos_f16 v5.l, v1.l row_mirror +// GFX11: v_cos_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_cos_f16 v5, v1 row_half_mirror -// GFX11: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_cos_f16 v5.l, v1.l row_half_mirror +// GFX11: v_cos_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_cos_f16 v5, v1 row_shl:1 -// GFX11: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_cos_f16 v5.l, v1.l row_shl:1 +// GFX11: v_cos_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_cos_f16 v5, v1 row_shl:15 -// GFX11: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_cos_f16 v5.l, v1.l row_shl:15 +// GFX11: v_cos_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_cos_f16 v5, v1 row_shr:1 -// GFX11: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_cos_f16 v5.l, v1.l row_shr:1 +// GFX11: v_cos_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_cos_f16 v5, v1 row_shr:15 -// GFX11: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_cos_f16 v5.l, v1.l row_shr:15 +// GFX11: v_cos_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] 
-v_cos_f16 v5, v1 row_ror:1 -// GFX11: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_cos_f16 v5.l, v1.l row_ror:1 +// GFX11: v_cos_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_cos_f16 v5, v1 row_ror:15 -// GFX11: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_cos_f16 v5.l, v1.l row_ror:15 +// GFX11: v_cos_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_cos_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_cos_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cos_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_cos_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_cos_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cos_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_cos_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_cos_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cos_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_cos_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_cos_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cos_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_cos_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cos_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +v_cos_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cos_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_cos_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cos_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_cos_f32 v5, v1 quad_perm:[3,2,1,0] // GFX11: v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index 2029baee77df9..d7051aff42d77 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -50,14 +50,23 @@ v_clz_i32_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_clz_i32_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_clz_i32_u32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_cos_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_cos_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_cos_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_cos_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cos_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_cos_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +v_cos_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cos_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05] + +v_cos_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cos_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00] v_cos_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cos_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index 936cce46f2ebc..263ad4bf513a1 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -47,6 +47,12 @@ v_ceil_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] v_cos_f16_e32 v128, 0xfe0b // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cos_f16_e32 v128.h, 0xfe0b +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v128.l, 0xfe0b +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + v_cos_f16_e32 v255, v1 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -56,6 +62,24 @@ v_cos_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_cos_f16_e32 
v255, v1 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction +v_cos_f16_e32 v255.h, v1.h +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.l, v1.l +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction + v_cos_f16_e32 v5, v199 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -65,6 +89,24 @@ v_cos_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_cos_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction +v_cos_f16_e32 v5.h, v199.h +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.l, v199.l +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction + v_cvt_f16_f32_e32 v128, 0xaf123456 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index 1c8d7e43be081..42c36538f2bf6 100644 --- 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -68,71 +68,137 @@ v_ceil_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_ceil_f16 v5, v199 quad_perm:[3,2,1,0] // GFX11: v_ceil_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_cos_f16 v128, 0xfe0b -// GFX11: v_cos_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cos_f16 v128.h, 0xfe0b +// GFX11: v_cos_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_cos_f16 v255, -1 -// GFX11: v_cos_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +v_cos_f16 v128.l, 0xfe0b +// GFX11: v_cos_f16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_cos_f16 v255, 0.5 -// GFX11: v_cos_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x00] +v_cos_f16 v255.h, -1 +// GFX11: v_cos_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0xc1,0x00,0x00,0x00] -v_cos_f16 v255, exec_hi -// GFX11: v_cos_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +v_cos_f16 v255.h, 0.5 +// GFX11: v_cos_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0xf0,0x00,0x00,0x00] -v_cos_f16 v255, exec_lo -// GFX11: v_cos_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +v_cos_f16 v255.h, exec_hi +// GFX11: v_cos_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7f,0x00,0x00,0x00] -v_cos_f16 v255, m0 -// GFX11: v_cos_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +v_cos_f16 v255.h, exec_lo +// GFX11: v_cos_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7e,0x00,0x00,0x00] -v_cos_f16 v255, null -// GFX11: v_cos_f16_e64 v255, null ; encoding: [0xff,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] 
+v_cos_f16 v255.h, m0 +// GFX11: v_cos_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7d,0x00,0x00,0x00] -v_cos_f16 v255, s1 -// GFX11: v_cos_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +v_cos_f16 v255.h, null +// GFX11: v_cos_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7c,0x00,0x00,0x00] -v_cos_f16 v255, s105 -// GFX11: v_cos_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +v_cos_f16 v255.h, s1 +// GFX11: v_cos_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x01,0x00,0x00,0x00] -v_cos_f16 v255, src_scc -// GFX11: v_cos_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x00] +v_cos_f16 v255.h, s105 +// GFX11: v_cos_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x69,0x00,0x00,0x00] -v_cos_f16 v255, ttmp15 -// GFX11: v_cos_f16_e64 v255, ttmp15 ; encoding: [0xff,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +v_cos_f16 v255.h, src_scc +// GFX11: v_cos_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0xfd,0x00,0x00,0x00] -v_cos_f16 v255, v1 -// GFX11: v_cos_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +v_cos_f16 v255.h, ttmp15 +// GFX11: v_cos_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7b,0x00,0x00,0x00] -v_cos_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cos_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cos_f16 v255.h, v1.h +// GFX11: v_cos_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00] -v_cos_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX11: v_cos_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cos_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xff,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_cos_f16 v255, v127 -// GFX11: v_cos_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x01,0x00,0x00] +v_cos_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cos_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cos_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_cos_f16 v255.h, v127.h +// GFX11: v_cos_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe1,0xd5,0x7f,0x01,0x00,0x00] -v_cos_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX11: v_cos_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_cos_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_cos_f16 v255, vcc_hi -// GFX11: v_cos_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +v_cos_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_cos_f16 v255, vcc_lo -// GFX11: v_cos_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +v_cos_f16 v255.h, vcc_hi +// GFX11: v_cos_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x6b,0x00,0x00,0x00] -v_cos_f16 v5, v199 -// GFX11: v_cos_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xe1,0xd5,0xc7,0x01,0x00,0x00] +v_cos_f16 v255.h, vcc_lo +// GFX11: v_cos_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x6a,0x00,0x00,0x00] -v_cos_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: 
v_cos_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_cos_f16 v255.l, -1 +// GFX11: v_cos_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] -v_cos_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX11: v_cos_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_cos_f16 v255.l, 0.5 +// GFX11: v_cos_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x00] + +v_cos_f16 v255.l, exec_hi +// GFX11: v_cos_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] + +v_cos_f16 v255.l, exec_lo +// GFX11: v_cos_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] + +v_cos_f16 v255.l, m0 +// GFX11: v_cos_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] + +v_cos_f16 v255.l, null +// GFX11: v_cos_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] + +v_cos_f16 v255.l, s1 +// GFX11: v_cos_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] + +v_cos_f16 v255.l, s105 +// GFX11: v_cos_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] + +v_cos_f16 v255.l, src_scc +// GFX11: v_cos_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x00] + +v_cos_f16 v255.l, ttmp15 +// GFX11: v_cos_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] + +v_cos_f16 v255.l, v1.l +// GFX11: v_cos_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] + +v_cos_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_cos_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cos_f16 v255.l, v127.l +// GFX11: v_cos_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x01,0x00,0x00] + +v_cos_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_cos_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_cos_f16 v255.l, vcc_hi +// GFX11: v_cos_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] + +v_cos_f16 v255.l, vcc_lo +// GFX11: v_cos_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] + +v_cos_f16 v5.h, v199.h +// GFX11: v_cos_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe1,0xd5,0xc7,0x01,0x00,0x00] + +v_cos_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_cos_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_cos_f16 v5.l, v199.l +// GFX11: v_cos_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xe1,0xd5,0xc7,0x01,0x00,0x00] + +v_cos_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_cos_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_cvt_f16_f32 v128.h, 0xaf123456 // GFX11: v_cvt_f16_f32_e64 v128.h, 0xaf123456 op_sel:[0,1] ; 
encoding: [0x80,0x40,0x8a,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index 204d87c280525..874fb5bffa0ad 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -220,47 +220,56 @@ v_clz_i32_u32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] -v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX11: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX11: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_cos_f16_e64_dpp v5, v1 row_mirror -// GFX11: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_mirror +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_half_mirror -// GFX11: v_cos_f16_e64_dpp v5, v1 
row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_shl:1 -// GFX11: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_shl:15 -// GFX11: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_shr:1 -// GFX11: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_shr:15 -// GFX11: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_ror:1 -// GFX11: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_ror:15 -// GFX11: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_cos_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_cos_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_cos_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index d779b65bc0ba9..8e6783e0f413c 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -61,17 +61,26 @@ v_clz_i32_u32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xb9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] -v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_cos_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_cos_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x08,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_cos_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0xc1,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_cos_f32_e64_dpp v5, v1 
dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 7abc0185d6af6..3f9af472a6372 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -262,50 +262,59 @@ v_clz_i32_u32_e64 v5, src_scc v_clz_i32_u32_e64 v255, 0xaf123456 // GFX11: v_clz_i32_u32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cos_f16_e64 v5, v1 -// GFX11: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +v_cos_f16_e64 v5.l, v1.l +// GFX11: v_cos_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] -v_cos_f16_e64 v5, v255 -// GFX11: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] +v_cos_f16_e64 v5.l, v255.l +// GFX11: v_cos_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] -v_cos_f16_e64 v5, s1 -// GFX11: v_cos_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, s1 +// GFX11: v_cos_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] -v_cos_f16_e64 v5, s105 -// GFX11: v_cos_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, s105 +// GFX11: v_cos_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] -v_cos_f16_e64 v5, vcc_lo -// GFX11: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, vcc_lo +// GFX11: v_cos_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] -v_cos_f16_e64 v5, vcc_hi -// GFX11: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, vcc_hi +// GFX11: v_cos_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] 
-v_cos_f16_e64 v5, ttmp15 -// GFX11: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, ttmp15 +// GFX11: v_cos_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] -v_cos_f16_e64 v5, m0 -// GFX11: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, m0 +// GFX11: v_cos_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] -v_cos_f16_e64 v5, exec_lo -// GFX11: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, exec_lo +// GFX11: v_cos_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] -v_cos_f16_e64 v5, exec_hi -// GFX11: v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, exec_hi +// GFX11: v_cos_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] -v_cos_f16_e64 v5, null -// GFX11: v_cos_f16_e64 v5, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, null +// GFX11: v_cos_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] -v_cos_f16_e64 v5, -1 -// GFX11: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, -1 +// GFX11: v_cos_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] -v_cos_f16_e64 v5, 0.5 mul:2 -// GFX11: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] +v_cos_f16_e64 v5.l, 0.5 mul:2 +// GFX11: v_cos_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] -v_cos_f16_e64 v5, src_scc mul:4 -// GFX11: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] +v_cos_f16_e64 v5.l, src_scc mul:4 +// GFX11: v_cos_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] -v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX11: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; 
encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX11: v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_cos_f16_e64 v5.h, v1.h +// GFX11: [0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00] + +v_cos_f16_e64 v5.l, v255.h +// GFX11: [0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00] + +v_cos_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX11: [0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_cos_f32_e64 v5, v1 // GFX11: v_cos_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index 8f517ecdfc84a..6c69f3fb78bc0 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -265,50 +265,62 @@ v_clz_i32_u32 v5, src_scc v_clz_i32_u32 v255, 0xaf123456 // GFX12: v_clz_i32_u32_e32 v255, 0xaf123456 ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_cos_f16 v5, v1 -// GFX12: v_cos_f16_e32 v5, v1 ; encoding: [0x01,0xc3,0x0a,0x7e] +v_cos_f16 v5.l, v1.l +// GFX12: v_cos_f16_e32 v5.l, v1.l ; encoding: [0x01,0xc3,0x0a,0x7e] -v_cos_f16 v5, v127 -// GFX12: v_cos_f16_e32 v5, v127 ; encoding: [0x7f,0xc3,0x0a,0x7e] +v_cos_f16 v5.l, v127.l +// GFX12: v_cos_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc3,0x0a,0x7e] -v_cos_f16 v5, s1 -// GFX12: v_cos_f16_e32 v5, s1 ; encoding: [0x01,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, s1 +// GFX12: v_cos_f16_e32 v5.l, s1 ; encoding: [0x01,0xc2,0x0a,0x7e] -v_cos_f16 v5, s105 -// GFX12: v_cos_f16_e32 v5, s105 ; encoding: [0x69,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, s105 +// GFX12: v_cos_f16_e32 v5.l, s105 ; encoding: [0x69,0xc2,0x0a,0x7e] -v_cos_f16 v5, vcc_lo -// GFX12: v_cos_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, vcc_lo +// GFX12: v_cos_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e] -v_cos_f16 v5, vcc_hi -// GFX12: v_cos_f16_e32 v5, vcc_hi ; 
encoding: [0x6b,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, vcc_hi +// GFX12: v_cos_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e] -v_cos_f16 v5, ttmp15 -// GFX12: v_cos_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, ttmp15 +// GFX12: v_cos_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e] -v_cos_f16 v5, m0 -// GFX12: v_cos_f16_e32 v5, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, m0 +// GFX12: v_cos_f16_e32 v5.l, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e] -v_cos_f16 v5, exec_lo -// GFX12: v_cos_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, exec_lo +// GFX12: v_cos_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e] -v_cos_f16 v5, exec_hi -// GFX12: v_cos_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, exec_hi +// GFX12: v_cos_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e] -v_cos_f16 v5, null -// GFX12: v_cos_f16_e32 v5, null ; encoding: [0x7c,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, null +// GFX12: v_cos_f16_e32 v5.l, null ; encoding: [0x7c,0xc2,0x0a,0x7e] -v_cos_f16 v5, -1 -// GFX12: v_cos_f16_e32 v5, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, -1 +// GFX12: v_cos_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e] -v_cos_f16 v5, 0.5 -// GFX12: v_cos_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, 0.5 +// GFX12: v_cos_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e] -v_cos_f16 v5, src_scc -// GFX12: v_cos_f16_e32 v5, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e] +v_cos_f16 v5.l, src_scc +// GFX12: v_cos_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e] -v_cos_f16 v127, 0xfe0b -// GFX12: v_cos_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_cos_f16 v127.l, 0xfe0b +// GFX12: v_cos_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +v_cos_f16 v5.l, v1.h +// GFX12: v_cos_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc3,0x0a,0x7e] + +v_cos_f16 v5.l, v127.h +// GFX12: v_cos_f16_e32 v5.l, v127.h ; encoding: 
[0xff,0xc3,0x0a,0x7e] + +v_cos_f16 v5.h, src_scc +// GFX12: v_cos_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7f] + +v_cos_f16 v127.h, 0xfe0b +// GFX12: v_cos_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] v_cos_f32 v5, v1 // GFX12: v_cos_f32_e32 v5, v1 ; encoding: [0x01,0x6d,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index 914cfcbb229a3..05a5f8bd44b9c 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -214,47 +214,53 @@ v_clz_i32_u32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_clz_i32_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_clz_i32_u32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x05,0x30] -v_cos_f16 v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +v_cos_f16 v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_cos_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_cos_f16 v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +v_cos_f16 v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_cos_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_cos_f16 v5, v1 row_mirror -// GFX12: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] +v_cos_f16 v5.l, v1.l row_mirror +// GFX12: v_cos_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_cos_f16 v5, v1 row_half_mirror -// GFX12: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] +v_cos_f16 v5.l, v1.l row_half_mirror +// GFX12: v_cos_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_cos_f16 v5, v1 row_shl:1 -// GFX12: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] +v_cos_f16 v5.l, v1.l row_shl:1 +// GFX12: v_cos_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_cos_f16 v5, v1 row_shl:15 -// GFX12: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +v_cos_f16 v5.l, v1.l row_shl:15 +// GFX12: v_cos_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_cos_f16 v5, v1 row_shr:1 -// GFX12: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] +v_cos_f16 v5.l, v1.l row_shr:1 +// GFX12: v_cos_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_cos_f16 v5, v1 row_shr:15 -// GFX12: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +v_cos_f16 v5.l, v1.l row_shr:15 +// GFX12: v_cos_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_cos_f16 v5, v1 row_ror:1 -// GFX12: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] +v_cos_f16 v5.l, v1.l row_ror:1 +// GFX12: v_cos_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_cos_f16 v5, v1 row_ror:15 -// GFX12: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +v_cos_f16 v5.l, v1.l row_ror:15 +// GFX12: 
v_cos_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_cos_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] +v_cos_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cos_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_cos_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +v_cos_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cos_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] -v_cos_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13] +v_cos_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cos_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13] -v_cos_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +v_cos_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cos_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +v_cos_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cos_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x09,0x13] + +v_cos_f16 v127.h, 
-|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cos_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_cos_f32 v5, v1 quad_perm:[3,2,1,0] // GFX12: v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index f1c4e863b1873..bf03e7f8e518c 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -49,14 +49,20 @@ v_clz_i32_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_clz_i32_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_clz_i32_u32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_cos_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] +v_cos_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_cos_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +v_cos_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cos_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +v_cos_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cos_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05] + 
+v_cos_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cos_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00] v_cos_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cos_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s index eb7b86635f35d..f584b69c33ec8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s @@ -26,6 +26,12 @@ v_ceil_f16_e32 v5, v199 quad_perm:[3,2,1,0] v_cos_f16_e32 v128, 0xfe0b // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cos_f16_e32 v128.h, 0xfe0b +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v128.l, 0xfe0b +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + v_cos_f16_e32 v255, v1 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -35,6 +41,24 @@ v_cos_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0] v_cos_f16_e32 v255, v1 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction +v_cos_f16_e32 v255.h, v1.h +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.l, v1.l +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + +v_cos_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction + v_cos_f16_e32 v5, v199 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode @@ -44,6 +68,24 @@ 
v_cos_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_cos_f16_e32 v5, v199 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction +v_cos_f16_e32 v5.h, v199.h +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.l, v199.l +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + +v_cos_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction + v_cvt_f16_f32_e32 v128.h, 0xaf123456 // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s index 2f0c0a1192f2f..27e92b7e4f22b 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s @@ -67,71 +67,137 @@ v_ceil_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] v_ceil_f16 v5, v199 quad_perm:[3,2,1,0] // GFX12: v_ceil_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] -v_cos_f16 v128, 0xfe0b -// GFX12: v_cos_f16_e64 v128, 0xfe0b ; encoding: [0x80,0x00,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_cos_f16 v128.h, 0xfe0b +// GFX12: v_cos_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_cos_f16 v255, -1 -// GFX12: v_cos_f16_e64 v255, -1 ; encoding: [0xff,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +v_cos_f16 v128.l, 0xfe0b +// GFX12: v_cos_f16_e64 v128.l, 0xfe0b ; encoding: [0x80,0x00,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 
-v_cos_f16 v255, 0.5 -// GFX12: v_cos_f16_e64 v255, 0.5 ; encoding: [0xff,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x00] +v_cos_f16 v255.h, -1 +// GFX12: v_cos_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0xc1,0x00,0x00,0x00] -v_cos_f16 v255, exec_hi -// GFX12: v_cos_f16_e64 v255, exec_hi ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +v_cos_f16 v255.h, 0.5 +// GFX12: v_cos_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0xf0,0x00,0x00,0x00] -v_cos_f16 v255, exec_lo -// GFX12: v_cos_f16_e64 v255, exec_lo ; encoding: [0xff,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +v_cos_f16 v255.h, exec_hi +// GFX12: v_cos_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7f,0x00,0x00,0x00] -v_cos_f16 v255, m0 -// GFX12: v_cos_f16_e64 v255, m0 ; encoding: [0xff,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +v_cos_f16 v255.h, exec_lo +// GFX12: v_cos_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7e,0x00,0x00,0x00] -v_cos_f16 v255, null -// GFX12: v_cos_f16_e64 v255, null ; encoding: [0xff,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] +v_cos_f16 v255.h, m0 +// GFX12: v_cos_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7d,0x00,0x00,0x00] -v_cos_f16 v255, s1 -// GFX12: v_cos_f16_e64 v255, s1 ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +v_cos_f16 v255.h, null +// GFX12: v_cos_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7c,0x00,0x00,0x00] -v_cos_f16 v255, s105 -// GFX12: v_cos_f16_e64 v255, s105 ; encoding: [0xff,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +v_cos_f16 v255.h, s1 +// GFX12: v_cos_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x01,0x00,0x00,0x00] -v_cos_f16 v255, src_scc -// GFX12: v_cos_f16_e64 v255, src_scc ; encoding: [0xff,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x00] +v_cos_f16 v255.h, s105 +// GFX12: v_cos_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x69,0x00,0x00,0x00] -v_cos_f16 v255, ttmp15 -// GFX12: v_cos_f16_e64 v255, ttmp15 ; 
encoding: [0xff,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +v_cos_f16 v255.h, src_scc +// GFX12: v_cos_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0xfd,0x00,0x00,0x00] -v_cos_f16 v255, v1 -// GFX12: v_cos_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +v_cos_f16 v255.h, ttmp15 +// GFX12: v_cos_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7b,0x00,0x00,0x00] -v_cos_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cos_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cos_f16 v255.h, v1.h +// GFX12: v_cos_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00] -v_cos_f16 v255, v1 quad_perm:[3,2,1,0] -// GFX12: v_cos_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cos_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_cos_f16 v255, v127 -// GFX12: v_cos_f16_e64 v255, v127 ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x01,0x00,0x00] +v_cos_f16 v255.h, v1.h quad_perm:[3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cos_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cos_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] +v_cos_f16 v255.h, v127.h +// GFX12: v_cos_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe1,0xd5,0x7f,0x01,0x00,0x00] -v_cos_f16 v255, v127 quad_perm:[3,2,1,0] -// GFX12: v_cos_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] +v_cos_f16 
v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] -v_cos_f16 v255, vcc_hi -// GFX12: v_cos_f16_e64 v255, vcc_hi ; encoding: [0xff,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +v_cos_f16 v255.h, v127.h quad_perm:[3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] -v_cos_f16 v255, vcc_lo -// GFX12: v_cos_f16_e64 v255, vcc_lo ; encoding: [0xff,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +v_cos_f16 v255.h, vcc_hi +// GFX12: v_cos_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x6b,0x00,0x00,0x00] -v_cos_f16 v5, v199 -// GFX12: v_cos_f16_e64 v5, v199 ; encoding: [0x05,0x00,0xe1,0xd5,0xc7,0x01,0x00,0x00] +v_cos_f16 v255.h, vcc_lo +// GFX12: v_cos_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x6a,0x00,0x00,0x00] -v_cos_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cos_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] +v_cos_f16 v255.l, -1 +// GFX12: v_cos_f16_e64 v255.l, -1 ; encoding: [0xff,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] -v_cos_f16 v5, v199 quad_perm:[3,2,1,0] -// GFX12: v_cos_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] +v_cos_f16 v255.l, 0.5 +// GFX12: v_cos_f16_e64 v255.l, 0.5 ; encoding: [0xff,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x00] + +v_cos_f16 v255.l, exec_hi +// GFX12: v_cos_f16_e64 v255.l, exec_hi ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] + +v_cos_f16 v255.l, exec_lo +// GFX12: v_cos_f16_e64 v255.l, exec_lo ; encoding: [0xff,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] + +v_cos_f16 v255.l, m0 +// GFX12: v_cos_f16_e64 v255.l, m0 ; encoding: [0xff,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] + +v_cos_f16 
v255.l, null +// GFX12: v_cos_f16_e64 v255.l, null ; encoding: [0xff,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] + +v_cos_f16 v255.l, s1 +// GFX12: v_cos_f16_e64 v255.l, s1 ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] + +v_cos_f16 v255.l, s105 +// GFX12: v_cos_f16_e64 v255.l, s105 ; encoding: [0xff,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] + +v_cos_f16 v255.l, src_scc +// GFX12: v_cos_f16_e64 v255.l, src_scc ; encoding: [0xff,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x00] + +v_cos_f16 v255.l, ttmp15 +// GFX12: v_cos_f16_e64 v255.l, ttmp15 ; encoding: [0xff,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] + +v_cos_f16 v255.l, v1.l +// GFX12: v_cos_f16_e64 v255.l, v1.l ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] + +v_cos_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_cos_f16 v255.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cos_f16 v255.l, v127.l +// GFX12: v_cos_f16_e64 v255.l, v127.l ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x01,0x00,0x00] + +v_cos_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05] + +v_cos_f16 v255.l, v127.l quad_perm:[3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff] + +v_cos_f16 v255.l, vcc_hi +// GFX12: v_cos_f16_e64 v255.l, vcc_hi ; encoding: [0xff,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] + +v_cos_f16 v255.l, vcc_lo +// GFX12: v_cos_f16_e64 v255.l, vcc_lo ; encoding: [0xff,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] + +v_cos_f16 v5.h, v199.h +// GFX12: v_cos_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe1,0xd5,0xc7,0x01,0x00,0x00] 
+ +v_cos_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_cos_f16 v5.h, v199.h quad_perm:[3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] + +v_cos_f16 v5.l, v199.l +// GFX12: v_cos_f16_e64 v5.l, v199.l ; encoding: [0x05,0x00,0xe1,0xd5,0xc7,0x01,0x00,0x00] + +v_cos_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05] + +v_cos_f16 v5.l, v199.l quad_perm:[3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff] v_cvt_f16_f32 v128.h, 0xaf123456 // GFX12: v_cvt_f16_f32_e64 v128.h, 0xaf123456 op_sel:[0,1] ; encoding: [0x80,0x40,0x8a,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index 224f7f090a64f..0ba9874b1a22e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -262,50 +262,59 @@ v_clz_i32_u32_e64 v5, src_scc v_clz_i32_u32_e64 v255, 0xaf123456 // GFX12: v_clz_i32_u32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cos_f16_e64 v5, v1 -// GFX12: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +v_cos_f16_e64 v5.l, v1.l +// GFX12: v_cos_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] -v_cos_f16_e64 v5, v255 -// GFX12: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] +v_cos_f16_e64 v5.l, v255.l +// GFX12: v_cos_f16_e64 v5.l, v255.l ; encoding: 
[0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] -v_cos_f16_e64 v5, s1 -// GFX12: v_cos_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, s1 +// GFX12: v_cos_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] -v_cos_f16_e64 v5, s105 -// GFX12: v_cos_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, s105 +// GFX12: v_cos_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] -v_cos_f16_e64 v5, vcc_lo -// GFX12: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, vcc_lo +// GFX12: v_cos_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] -v_cos_f16_e64 v5, vcc_hi -// GFX12: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, vcc_hi +// GFX12: v_cos_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] -v_cos_f16_e64 v5, ttmp15 -// GFX12: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, ttmp15 +// GFX12: v_cos_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] -v_cos_f16_e64 v5, m0 -// GFX12: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, m0 +// GFX12: v_cos_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] -v_cos_f16_e64 v5, exec_lo -// GFX12: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, exec_lo +// GFX12: v_cos_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] -v_cos_f16_e64 v5, exec_hi -// GFX12: v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, exec_hi +// GFX12: v_cos_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] -v_cos_f16_e64 v5, null -// GFX12: v_cos_f16_e64 v5, null ; encoding: 
[0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, null +// GFX12: v_cos_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] -v_cos_f16_e64 v5, -1 -// GFX12: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +v_cos_f16_e64 v5.l, -1 +// GFX12: v_cos_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] -v_cos_f16_e64 v5, 0.5 mul:2 -// GFX12: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] +v_cos_f16_e64 v5.l, 0.5 mul:2 +// GFX12: v_cos_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] -v_cos_f16_e64 v5, src_scc mul:4 -// GFX12: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] +v_cos_f16_e64 v5.l, src_scc mul:4 +// GFX12: v_cos_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] -v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX12: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX12: v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_cos_f16_e64 v5.h, v1.h +// GFX12: v_cos_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00] + +v_cos_f16_e64 v5.l, v255.h +// GFX12: v_cos_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00] + +v_cos_f16_e64 v255.h, -|0xfe0b| clamp div:2 +// GFX12: v_cos_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_cos_f32_e64 v5, v1 // GFX12: v_cos_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 0a8ce42e130c3..197f02719905d 100644 --- 
a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -211,47 +211,56 @@ v_clz_i32_u32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] -v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX12: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX12: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -v_cos_f16_e64_dpp v5, v1 row_mirror -// GFX12: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_mirror +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_half_mirror -// GFX12: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf 
; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_shl:1 -// GFX12: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_shl:15 -// GFX12: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_shr:1 -// GFX12: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_shr:15 -// GFX12: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_ror:1 -// GFX12: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_ror:15 -// GFX12: 
v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_cos_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_cos_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_cos_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index 930f8f8d56957..0dfc47b4e4020 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -52,17 +52,26 @@ v_clz_i32_u32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xb9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] -v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: 
v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_cos_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_cos_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x05,0x08,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_cos_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt index cc3b8fdd9093b..8d86bafca059f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt @@ -287,49 +287,82 @@ # GFX11: v_clz_i32_u32_e32 v255, 0xaf123456 ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf] 0x01,0xc3,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, v1 ; encoding: [0x01,0xc3,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, v1.l ; encoding: [0x01,0xc3,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, v1 ; encoding: [0x01,0xc3,0x0a,0x7e] 0x7f,0xc3,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, v127 ; encoding: [0x7f,0xc3,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc3,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, v127 ; encoding: [0x7f,0xc3,0x0a,0x7e] 0x01,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, s1 ; encoding: [0x01,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, s1 ; encoding: [0x01,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, s1 ; encoding: [0x01,0xc2,0x0a,0x7e] 0x69,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, s105 ; encoding: [0x69,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, s105 ; encoding: [0x69,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, s105 ; encoding: [0x69,0xc2,0x0a,0x7e] 0x6a,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e] 
+# GFX11-FAKE16: v_cos_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e] 0x6b,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e] 0x7b,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e] 0x7d,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e] 0x7e,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e] 0x7f,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e] 0x7c,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, null ; encoding: [0x7c,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, null ; encoding: [0x7c,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, null ; encoding: [0x7c,0xc2,0x0a,0x7e] 0xc1,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e] 0xf0,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e] 
0xfd,0xc2,0x0a,0x7e -# GFX11: v_cos_f16_e32 v5, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e] +# GFX11-REAL16: v_cos_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e] 0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00 -# GFX11: v_cos_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] + +0x81,0xc3,0x0a,0x7e +# GFX11-REAL16: v_cos_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc3,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xc3,0x0a,0x7e] + +0xff,0xc3,0x0a,0x7e +# GFX11-REAL16: v_cos_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc3,0x0a,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xc3,0x0a,0x7e] + +0xf0,0xc2,0xfe,0x7e +# GFX11-REAL16: v_cos_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc2,0xfe,0x7e] +# GFX11-FAKE16: v_cos_f16_e32 v127, 0.5 ; encoding: [0xf0,0xc2,0xfe,0x7e] + +0xfd,0xc2,0x0a,0x7f +# GFX11-REAL16: v_cos_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7f] + +0xff,0xc2,0xfe,0x7f,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cos_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7f,0x0b,0xfe,0x00,0x00] 0x01,0x6d,0x0a,0x7e # GFX11: v_cos_f32_e32 v5, v1 ; encoding: [0x01,0x6d,0x0a,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index ba9e8142942de..f01ce5a31be3c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -229,46 +229,72 @@ # GFX11: v_clz_i32_u32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: 
v_cos_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: 
v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX11: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX11: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX11: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-REAL16: v_cos_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xc2,0xfe,0x7e,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cos_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cos_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x5f,0x01,0x01] + +0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cos_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX11-REAL16: v_cos_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX11: v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index dda9dfcb35b1a..0f102c52a3666 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -43,10 +43,23 @@ # GFX11: v_clz_i32_u32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cos_f16_dpp v5, v1 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX11: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xc2,0xfe,0x7e,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cos_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cos_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x77,0x39,0x05] + +0xe9,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cos_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187 ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cos_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX11: v_cos_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt index 0191f37c14e31..6e5239522df41 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt @@ -239,46 +239,72 @@ # GFX11: v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: 
v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX11: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 
bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX11: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX11: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; 
encoding: [0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt index ab3788deeed3d..d7f9e86d3ca00 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt @@ -61,16 +61,32 @@ # GFX11: v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX11: v_cos_f16_e64_dpp v5, v1 mul:2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX11: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x08,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index 2e741322eb122..db5ba967d709b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -281,49 +281,76 @@ # GFX11: v_clz_i32_u32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, s1 ; encoding: 
[0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, exec_lo 
; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00 -# GFX11: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08 -# GFX11: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-REAL16: v_cos_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] +# GFX11-FAKE16: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10 -# GFX11: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-REAL16: v_cos_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] +# GFX11-FAKE16: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX11: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cos_f16_e64 
v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00 +# GFX11-REAL16: v_cos_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00 +# GFX11-REAL16: v_cos_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cos_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00 # GFX11: v_cos_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index 4d6e8ffbd9a27..d37c9229f1666 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -237,46 +237,68 @@ # GFX12: v_clz_i32_u32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX12: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# 
GFX12-FAKE16: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX12: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX12: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX12: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13 -# GFX12: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13] 0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 -# GFX12: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-REAL16: v_cos_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] + +0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cos_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176 ; encoding: [0x81,0x60,0x01,0x13] + +0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x3d,0x30 +# GFX12-REAL16: v_cos_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183 ; encoding: [0xff,0x6f,0x3d,0x30] 0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff # GFX12: v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index fcc1d3f97dcb1..f3dce5e6d5b93 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -44,10 +44,19 @@ # GFX12: v_clz_i32_u32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX12: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX12: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cos_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: 
v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] + +0xea,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cos_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX12: v_cos_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index dad6b502e0bd0..e4b619d87e400 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -279,49 +279,76 @@ # GFX12: v_clz_i32_u32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, s105 ; encoding: 
[0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: 
v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08 -# GFX12: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-REAL16: v_cos_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] +# GFX12-FAKE16: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] 0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10 -# GFX12: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-REAL16: v_cos_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] +# GFX12-FAKE16: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] 0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 -# GFX12: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00 +# GFX12-REAL16: v_cos_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00] +# 
GFX12-FAKE16: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00 +# GFX12-REAL16: v_cos_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] + +0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cos_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] 0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_cos_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index ccf5f4b21b73c..b77cf5ab6efc1 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -227,46 +227,72 @@ # GFX12: v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: 
v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX12: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 -# GFX12: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] 0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 -# GFX12: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] + +0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13] + +0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: 
v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] 0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 8018f80798573..50339f51c5629 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -49,16 +49,32 @@ # GFX12: v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX12: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX12: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] 0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX12: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x08,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0xff,0xc1,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] 0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From 9f5cefebb482331796ceaebbfcebcd5aee1eb339 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Fri, 3 Jan 2025 15:12:39 -0600 Subject: [PATCH 125/480] [mlir][Affine] Generalize the linearize(delinearize()) simplifications (#117637) The existing canonicalization patterns would only cancel out cases where the entire result list of an affine.delinearize_index was passed to an affine.linearize_index and the basis elements matched exactly (except possibly for the outer bounds). This was correct, but limited, and left open many cases where a delinearize_index would take a series of divisions and modulos only for a subsequent linearize_index to use additions and multiplications to undo all that work. This sort of simplification is reasonably easy to observe at the level of splitting and merging indexes, but difficult to perform once the underlying arithmetic operations have been created. Therefore, this commit generalizes the existing simplification logic. Now, any run of two or more delinearize_index results that appears within the argument list of a linearize_index operation with the same basis (or where they're both at the outermost position and so can be unbounded, or when `linearize_index disjoint` implies a bound not present on the `delinearize_index`) will be reduced to one single delinearize_index output, whose basis element (that is, size or length) is equal to the product of the sizes that were simplified away. 
That is, we can now simplify %0:2 = affine.delinearize_index %n into (8, 8) : index, index %1 = affine.linearize_index [%x, %0#0, %0#1, %y] by (3, 8, 8, 5) : index to the simpler %1 = affine.linearize_index [%x, %n, %y] by (3, 64, 5) : index This new pattern also works with dynamically-sized basis values. While I'm here, I fixed a bunch of typos in existing tests, and added a new getPaddedBasis() method to make processing potentially-underspecified basis elements simpler in some cases. --- .../mlir/Dialect/Affine/IR/AffineOps.td | 27 +- mlir/lib/Dialect/Affine/IR/AffineOps.cpp | 276 ++++++++++++++++-- mlir/test/Dialect/Affine/canonicalize.mlir | 245 +++++++++++++++- 3 files changed, 506 insertions(+), 42 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td index f5ca24389065e..e2eab1fb2178e 100644 --- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td @@ -1083,6 +1083,9 @@ def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index", [Pure]> { %indices_2 = affine.apply #map2()[%linear_index] ``` + In other words, `%0:3 = affine.delinearize_index %x into (B, C)` produces + `%0 = {%x / (B * C), (%x mod (B * C)) / C, %x mod C}`. + The basis may either contain `N` or `N-1` elements, where `N` is the number of results. 
If there are N basis elements, the first one will not be used during computations, but may be used during analysis and canonicalization to eliminate terms from @@ -1098,7 +1101,12 @@ def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index", [Pure]> { %0:3 = affine.delinearize_index %linear_index into (244, 244) : index, index ``` - Note that, due to the constraints of affine maps, all the basis elements must + Note that, for symmetry with `getPaddedBasis()`, if `hasOuterBound` is `true` + when one of the `OpFoldResult` builders is called but the first element of the + basis is `nullptr`, that first element is ignored and the builder proceeds as if + there was no outer bound. + + Due to the constraints of affine maps, all the basis elements must be strictly positive. A dynamic basis element being 0 or negative causes undefined behavior. }]; @@ -1136,6 +1144,11 @@ def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index", [Pure]> { /// Return a vector that contains the basis of the operation, removing /// the outer bound if one is present. SmallVector getEffectiveBasis(); + + /// Return the vector with one basis element per result of the operation. If + /// there is no outer bound specified, the leading entry of this result will be + /// nullptr. + SmallVector getPaddedBasis(); }]; let hasVerifier = 1; @@ -1160,6 +1173,9 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index", sum(i = 0 to N-1) %idx_i * product(j = i + 1 to N-1) B_j ``` + In other words, `%0 = affine.linearize_index [%z, %y, %x] by (Z, Y, X)` + gives `%0 = %x + %y * X + %z * X * Y`, or `%0 = %x + X * (%y + Y * (%z))`. + The basis may either have `N` or `N-1` elements, where `N` is the number of inputs to linearize_index. 
If `N` inputs are provided, the first one is not used in computation, but may be used during analysis or canonicalization as a bound @@ -1168,6 +1184,10 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index", If all `N` basis elements are provided, the linearize_index operation is said to "have an outer bound". + As a convenience, and for symmetry with `getPaddedBasis()`, ifg the first + element of a set of `OpFoldResult`s passed to the builders of this operation is + `nullptr`, that element is ignored. + If the `disjoint` property is present, this is an optimization hint that, for all `i`, `0 <= %idx_i < B_i` - that is, no index affects any other index, except that `%idx_0` may be negative to make the index as a whole negative. @@ -1224,6 +1244,11 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index", /// Return a vector that contains the basis of the operation, removing /// the outer bound if one is present. SmallVector getEffectiveBasis(); + + /// Return the vector with one basis element per index operand of the operation. + /// If there is no outer bound specified, the leading entry of this basis will be + /// nullptr. 
+ SmallVector getPaddedBasis(); }]; let hasVerifier = 1; diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index dceebbfec586c..b45829bcf6d2c 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -4520,6 +4520,10 @@ void AffineDelinearizeIndexOp::build(OpBuilder &odsBuilder, OperationState &odsState, Value linearIndex, ValueRange basis, bool hasOuterBound) { + if (hasOuterBound && !basis.empty() && basis.front() == nullptr) { + hasOuterBound = false; + basis = basis.drop_front(); + } SmallVector dynamicBasis; SmallVector staticBasis; dispatchIndexOpFoldResults(getAsOpFoldResult(basis), dynamicBasis, @@ -4533,6 +4537,10 @@ void AffineDelinearizeIndexOp::build(OpBuilder &odsBuilder, Value linearIndex, ArrayRef basis, bool hasOuterBound) { + if (hasOuterBound && !basis.empty() && basis.front() == OpFoldResult()) { + hasOuterBound = false; + basis = basis.drop_front(); + } SmallVector dynamicBasis; SmallVector staticBasis; dispatchIndexOpFoldResults(basis, dynamicBasis, staticBasis); @@ -4654,6 +4662,13 @@ SmallVector AffineDelinearizeIndexOp::getEffectiveBasis() { return getMixedValues(getStaticBasis(), getDynamicBasis(), builder); } +SmallVector AffineDelinearizeIndexOp::getPaddedBasis() { + SmallVector ret = getMixedBasis(); + if (!hasOuterBound()) + ret.insert(ret.begin(), OpFoldResult()); + return ret; +} + namespace { // Drops delinearization indices that correspond to unit-extent basis @@ -4672,25 +4687,27 @@ struct DropUnitExtentBasis return zero.value(); }; - bool hasOuterBound = delinearizeOp.hasOuterBound(); // Replace all indices corresponding to unit-extent basis with 0. // Remaining basis can be used to get a new `affine.delinearize_index` op. 
SmallVector newBasis; - for (auto [index, basis] : llvm::enumerate(delinearizeOp.getMixedBasis())) { - std::optional basisVal = getConstantIntValue(basis); + for (auto [index, basis] : + llvm::enumerate(delinearizeOp.getPaddedBasis())) { + std::optional basisVal = + basis ? getConstantIntValue(basis) : std::nullopt; if (basisVal && *basisVal == 1) - replacements[index + (hasOuterBound ? 0 : 1)] = getZero(); + replacements[index] = getZero(); else newBasis.push_back(basis); } - if (newBasis.size() == delinearizeOp.getStaticBasis().size()) + if (newBasis.size() == delinearizeOp.getNumResults()) return rewriter.notifyMatchFailure(delinearizeOp, "no unit basis elements"); - if (!newBasis.empty() || !hasOuterBound) { + if (!newBasis.empty()) { + // Will drop the leading nullptr from `basis` if there was no outer bound. auto newDelinearizeOp = rewriter.create( - loc, delinearizeOp.getLinearIndex(), newBasis, hasOuterBound); + loc, delinearizeOp.getLinearIndex(), newBasis); int newIndex = 0; // Map back the new delinearized indices to the values they replace. 
for (auto &replacement : replacements) { @@ -4871,6 +4888,8 @@ void AffineLinearizeIndexOp::build(OpBuilder &odsBuilder, OperationState &odsState, ValueRange multiIndex, ValueRange basis, bool disjoint) { + if (!basis.empty() && basis.front() == Value()) + basis = basis.drop_front(); SmallVector dynamicBasis; SmallVector staticBasis; dispatchIndexOpFoldResults(getAsOpFoldResult(basis), dynamicBasis, @@ -4883,6 +4902,8 @@ void AffineLinearizeIndexOp::build(OpBuilder &odsBuilder, ValueRange multiIndex, ArrayRef basis, bool disjoint) { + if (!basis.empty() && basis.front() == OpFoldResult()) + basis = basis.drop_front(); SmallVector dynamicBasis; SmallVector staticBasis; dispatchIndexOpFoldResults(basis, dynamicBasis, staticBasis); @@ -4965,7 +4986,14 @@ SmallVector AffineLinearizeIndexOp::getEffectiveBasis() { builder); } - return ::mlir::getMixedValues(getStaticBasis(), getDynamicBasis(), builder); + return getMixedValues(getStaticBasis(), getDynamicBasis(), builder); +} + +SmallVector AffineLinearizeIndexOp::getPaddedBasis() { + SmallVector ret = getMixedBasis(); + if (!hasOuterBound()) + ret.insert(ret.begin(), OpFoldResult()); + return ret; } namespace { @@ -5027,38 +5055,228 @@ struct DropLinearizeUnitComponentsIfDisjointOrZero final } }; -/// Cancel out linearize_index(delinearize_index(x, B), B). +/// Return the product of `terms`, creating an `affine.apply` if any of them are +/// non-constant values. If any of `terms` is `nullptr`, return `nullptr`. 
+static OpFoldResult computeProduct(Location loc, OpBuilder &builder, + ArrayRef terms) { + int64_t nDynamic = 0; + SmallVector dynamicPart; + AffineExpr result = builder.getAffineConstantExpr(1); + for (OpFoldResult term : terms) { + if (!term) + return term; + std::optional maybeConst = getConstantIntValue(term); + if (maybeConst) { + result = result * builder.getAffineConstantExpr(*maybeConst); + } else { + dynamicPart.push_back(term.get()); + result = result * builder.getAffineSymbolExpr(nDynamic++); + } + } + if (auto constant = dyn_cast(result)) + return getAsIndexOpFoldResult(builder.getContext(), constant.getValue()); + return builder.create(loc, result, dynamicPart).getResult(); +} + +/// If conseceutive outputs of a delinearize_index are linearized with the same +/// bounds, canonicalize away the redundant arithmetic. +/// +/// That is, if we have +/// ``` +/// %s:N = affine.delinearize_index %x into (...a, B1, B2, ... BK, ...b) +/// %t = affine.linearize_index [...c, %s#I, %s#(I + 1), ... %s#(I+K-1), ...d] +/// by (...e, B1, B2, ..., BK, ...f) +/// ``` /// -/// That is, rewrite +/// We can rewrite this to /// ``` -/// %0:N = affine.delinearize_index %x by (%b1, %b2, ... %bN) -/// %y = affine.linearize_index [%0#0, %0#1, ... %0#(N-1)] by (%b1, %b2, ... -/// %bN) +/// B = B1 * B2 ... BK +/// %sMerged:(N-K+1) affine.delinearize_index %x into (...a, B, ...b) +/// %t = affine.linearize_index [...c, %s#I, ...d] by (...e, B, ...f) /// ``` -/// to replacing `%y` with `%x`. -struct CancelLinearizeOfDelinearizeExact final +/// where we replace all results of %s unaffected by the change with results +/// from %sMerged. 
+/// +/// As a special case, if all results of the delinearize are merged in this way +/// we can replace those usages with %x, thus cancelling the delinearization +/// entirely, as in +/// ``` +/// %s:3 = affine.delinearize_index %x into (2, 4, 8) +/// %t = affine.linearize_index [%s#0, %s#1, %s#2, %c0] by (2, 4, 8, 16) +/// ``` +/// becoming `%t = affine.linearize_index [%x, %c0] by (64, 16)` +struct CancelLinearizeOfDelinearizePortion final : OpRewritePattern { using OpRewritePattern::OpRewritePattern; +private: + // Struct representing a case where the cancellation pattern + // applies. A `Match` means that `length` inputs to the linearize operation + // starting at `linStart` can be cancelled with `length` outputs of + // `delinearize`, starting from `delinStart`. + struct Match { + AffineDelinearizeIndexOp delinearize; + unsigned linStart = 0; + unsigned delinStart = 0; + unsigned length = 0; + }; + +public: LogicalResult matchAndRewrite(affine::AffineLinearizeIndexOp linearizeOp, PatternRewriter &rewriter) const override { - auto delinearizeOp = linearizeOp.getMultiIndex() - .front() - .getDefiningOp(); - if (!delinearizeOp) - return rewriter.notifyMatchFailure( - linearizeOp, "last entry doesn't come from a delinearize"); + SmallVector matches; + + const SmallVector linBasis = linearizeOp.getPaddedBasis(); + ArrayRef linBasisRef = linBasis; + + ValueRange multiIndex = linearizeOp.getMultiIndex(); + unsigned numLinArgs = multiIndex.size(); + unsigned linArgIdx = 0; + // We only want to replace one run from the same delinearize op per + // pattern invocation lest we run into invalidation issues. 
+ llvm::SmallPtrSet alreadyMatchedDelinearize; + while (linArgIdx < numLinArgs) { + auto asResult = dyn_cast(multiIndex[linArgIdx]); + if (!asResult) { + linArgIdx++; + continue; + } - if (linearizeOp.getEffectiveBasis() != delinearizeOp.getEffectiveBasis()) - return rewriter.notifyMatchFailure( - linearizeOp, "basis of linearize and delinearize don't match exactly " - "(excluding outer bounds)"); + auto delinearizeOp = + dyn_cast(asResult.getOwner()); + if (!delinearizeOp) { + linArgIdx++; + continue; + } + + /// Result 0 of the delinearize and argument 0 of the linearize can + /// leave their maximum value unspecified. However, even if this happens + /// we can still sometimes start the match process. Specifically, if + /// - The argument we're matching is result 0 and argument 0 (so the + /// bounds don't matter). For example, + /// + /// %0:2 = affine.delinearize_index %x into (8) : index, index + /// %1 = affine.linearize_index [%s#0, %s#1, ...] (8, ...) + /// allows cancellation + /// - The delinearization doesn't specify a bound, but the linearization + /// is `disjoint`, which asserts that the bound on the linearization is + /// correct. 
+ unsigned delinArgIdx = asResult.getResultNumber(); + SmallVector delinBasis = delinearizeOp.getPaddedBasis(); + OpFoldResult firstDelinBound = delinBasis[delinArgIdx]; + OpFoldResult firstLinBound = linBasis[linArgIdx]; + bool boundsMatch = firstDelinBound == firstLinBound; + bool bothAtFront = linArgIdx == 0 && delinArgIdx == 0; + bool knownByDisjoint = + linearizeOp.getDisjoint() && delinArgIdx == 0 && !firstDelinBound; + if (!boundsMatch && !bothAtFront && !knownByDisjoint) { + linArgIdx++; + continue; + } + + unsigned j = 1; + unsigned numDelinOuts = delinearizeOp.getNumResults(); + for (; j + linArgIdx < numLinArgs && j + delinArgIdx < numDelinOuts; + ++j) { + if (multiIndex[linArgIdx + j] != + delinearizeOp.getResult(delinArgIdx + j)) + break; + if (linBasis[linArgIdx + j] != delinBasis[delinArgIdx + j]) + break; + } + // If there're multiple matches against the same delinearize_index, + // only rewrite the first one we find to prevent invalidations. The next + // ones will be taken care of by subsequent pattern invocations. + if (j <= 1 || !alreadyMatchedDelinearize.insert(delinearizeOp).second) { + linArgIdx++; + continue; + } + matches.push_back(Match{delinearizeOp, linArgIdx, delinArgIdx, j}); + linArgIdx += j; + } - if (delinearizeOp.getResults() != linearizeOp.getMultiIndex()) + if (matches.empty()) return rewriter.notifyMatchFailure( - linearizeOp, "not all indices come from delinearize"); + linearizeOp, "no run of delinearize outputs to deal with"); + + // Record all the delinearize replacements so we can do them after creating + // the new linearization operation, since the new operation might use + // outputs of something we're replacing. 
+ SmallVector> delinearizeReplacements; + + SmallVector newIndex; + newIndex.reserve(numLinArgs); + SmallVector newBasis; + newBasis.reserve(numLinArgs); + unsigned prevMatchEnd = 0; + for (Match m : matches) { + unsigned gap = m.linStart - prevMatchEnd; + llvm::append_range(newIndex, multiIndex.slice(prevMatchEnd, gap)); + llvm::append_range(newBasis, linBasisRef.slice(prevMatchEnd, gap)); + // Update here so we don't forget this during early continues + prevMatchEnd = m.linStart + m.length; + + PatternRewriter::InsertionGuard g(rewriter); + rewriter.setInsertionPoint(m.delinearize); + + ArrayRef basisToMerge = + linBasisRef.slice(m.linStart, m.length); + // We use the slice from the linearize's basis above because of the + // "bounds inferred from `disjoint`" case above. + OpFoldResult newSize = + computeProduct(linearizeOp.getLoc(), rewriter, basisToMerge); + + // Trivial case where we can just skip past the delinearize all together + if (m.length == m.delinearize.getNumResults()) { + newIndex.push_back(m.delinearize.getLinearIndex()); + newBasis.push_back(newSize); + // Pad out set of replacements so we don't do anything with this one. + delinearizeReplacements.push_back(SmallVector()); + continue; + } + + SmallVector newDelinResults; + SmallVector newDelinBasis = m.delinearize.getPaddedBasis(); + newDelinBasis.erase(newDelinBasis.begin() + m.delinStart, + newDelinBasis.begin() + m.delinStart + m.length); + newDelinBasis.insert(newDelinBasis.begin() + m.delinStart, newSize); + auto newDelinearize = rewriter.create( + m.delinearize.getLoc(), m.delinearize.getLinearIndex(), + newDelinBasis); + + // Since there may be other uses of the indices we just merged together, + // create a residual affine.delinearize_index that delinearizes the + // merged output into its component parts. 
+ Value combinedElem = newDelinearize.getResult(m.delinStart); + auto residualDelinearize = rewriter.create( + m.delinearize.getLoc(), combinedElem, basisToMerge); + + // Swap all the uses of the unaffected delinearize outputs to the new + // delinearization so that the old code can be removed if this + // linearize_index is the only user of the merged results. + llvm::append_range(newDelinResults, + newDelinearize.getResults().take_front(m.delinStart)); + llvm::append_range(newDelinResults, residualDelinearize.getResults()); + llvm::append_range( + newDelinResults, + newDelinearize.getResults().drop_front(m.delinStart + 1)); + + delinearizeReplacements.push_back(newDelinResults); + newIndex.push_back(combinedElem); + newBasis.push_back(newSize); + } + llvm::append_range(newIndex, multiIndex.drop_front(prevMatchEnd)); + llvm::append_range(newBasis, linBasisRef.drop_front(prevMatchEnd)); + rewriter.replaceOpWithNewOp( + linearizeOp, newIndex, newBasis, linearizeOp.getDisjoint()); + + for (auto [m, newResults] : + llvm::zip_equal(matches, delinearizeReplacements)) { + if (newResults.empty()) + continue; + rewriter.replaceOp(m.delinearize, newResults); + } - rewriter.replaceOp(linearizeOp, delinearizeOp.getLinearIndex()); return success(); } }; @@ -5096,7 +5314,7 @@ struct DropLinearizeLeadingZero final void affine::AffineLinearizeIndexOp::getCanonicalizationPatterns( RewritePatternSet &patterns, MLIRContext *context) { - patterns.add(context); } diff --git a/mlir/test/Dialect/Affine/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir index 717004eb50c0f..a9ac13ad71624 100644 --- a/mlir/test/Dialect/Affine/canonicalize.mlir +++ b/mlir/test/Dialect/Affine/canonicalize.mlir @@ -1917,12 +1917,12 @@ func.func @linearize_one_element_basis(%arg0: index, %arg1: index) -> index { // ----- -// CHECK-LABEL: func @cancel_linearize_denearize_exact( +// CHECK-LABEL: func @cancel_linearize_delinearize_exact( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, // CHECK-SAME: 
%[[ARG1:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index) // CHECK: return %[[ARG0]] -func.func @cancel_linearize_denearize_exact(%arg0: index, %arg1: index, %arg2: index) -> index { +func.func @cancel_linearize_delinearize_exact(%arg0: index, %arg1: index, %arg2: index) -> index { %0:3 = affine.delinearize_index %arg0 into (%arg1, 4, %arg2) : index, index, index %1 = affine.linearize_index [%0#0, %0#1, %0#2] by (%arg1, 4, %arg2) : index return %1 : index @@ -1930,12 +1930,12 @@ func.func @cancel_linearize_denearize_exact(%arg0: index, %arg1: index, %arg2: i // ----- -// CHECK-LABEL: func @cancel_linearize_denearize_linearize_extra_bound( +// CHECK-LABEL: func @cancel_linearize_delinearize_linearize_extra_bound( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index) // CHECK: return %[[ARG0]] -func.func @cancel_linearize_denearize_linearize_extra_bound(%arg0: index, %arg1: index, %arg2: index) -> index { +func.func @cancel_linearize_delinearize_linearize_extra_bound(%arg0: index, %arg1: index, %arg2: index) -> index { %0:3 = affine.delinearize_index %arg0 into (4, %arg2) : index, index, index %1 = affine.linearize_index [%0#0, %0#1, %0#2] by (%arg1, 4, %arg2) : index return %1 : index @@ -1943,12 +1943,12 @@ func.func @cancel_linearize_denearize_linearize_extra_bound(%arg0: index, %arg1: // ----- -// CHECK-LABEL: func @cancel_linearize_denearize_delinearize_extra_bound( +// CHECK-LABEL: func @cancel_linearize_delinearize_delinearize_extra_bound( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index) // CHECK: return %[[ARG0]] -func.func @cancel_linearize_denearize_delinearize_extra_bound(%arg0: index, %arg1: index, %arg2: index) -> index { +func.func @cancel_linearize_delinearize_delinearize_extra_bound(%arg0: index, %arg1: index, %arg2: index) -> index { %0:3 = 
affine.delinearize_index %arg0 into (%arg1, 4, %arg2) : index, index, index %1 = affine.linearize_index [%0#0, %0#1, %0#2] by (4, %arg2) : index return %1 : index @@ -1956,31 +1956,252 @@ func.func @cancel_linearize_denearize_delinearize_extra_bound(%arg0: index, %arg // ----- +// CHECK-LABEL: func @cancel_linearize_delinearize_head( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index) +// CHECK: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (12, 8) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[ARG1]]] by (12, 16) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_head(%arg0: index, %arg1: index) -> index { + %0:3 = affine.delinearize_index %arg0 into (3, 4, 8) : index, index, index + %1 = affine.linearize_index [%0#0, %0#1, %arg1] by (3, 4, 16) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_delinearize_head_delinearize_unbounded( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index) +// CHECK: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (12, 8) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[ARG1]]] by (12, 16) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_head_delinearize_unbounded(%arg0: index, %arg1: index) -> index { + %0:3 = affine.delinearize_index %arg0 into (4, 8) : index, index, index + %1 = affine.linearize_index [%0#0, %0#1, %arg1] by (3, 4, 16) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_delinearize_head_linearize_unbounded( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index) +// CHECK: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (8) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[ARG1]]] by (16) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_head_linearize_unbounded(%arg0: index, %arg1: index) -> 
index { + %0:3 = affine.delinearize_index %arg0 into (3, 4, 8) : index, index, index + %1 = affine.linearize_index [%0#0, %0#1, %arg1] by (4, 16) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_delinearize_head_both_unbounded( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index) +// CHECK: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (8) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[ARG1]]] by (16) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_head_both_unbounded(%arg0: index, %arg1: index) -> index { + %0:3 = affine.delinearize_index %arg0 into (4, 8) : index, index, index + %1 = affine.linearize_index [%0#0, %0#1, %arg1] by (4, 16) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_delinearize_tail( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index) +// CHECK: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (3, 32) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[ARG1]], %[[DELIN]]#1] by (5, 32) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_tail(%arg0: index, %arg1: index) -> index { + %0:3 = affine.delinearize_index %arg0 into (3, 4, 8) : index, index, index + %1 = affine.linearize_index [%arg1, %0#1, %0#2] by (5, 4, 8) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_delinearize_middle_exact( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-z0-9]+]]: index) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[ARG1]], %[[ARG0]], %[[ARG2]]] by (9, 30, 7) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_middle_exact(%arg0: index, %arg1: index, %arg2: index) -> index { + %0:3 = affine.delinearize_index %arg0 into (2, 3, 5) : index, index, index + %1 = affine.linearize_index [%arg1, %0#0, %0#1, 
%0#2, %arg2] by (9, 2, 3, 5, 7) : index + return %1 : index +} + +// ----- + +// CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1] -> ((s0 * s1) * 16)> + +// CHECK-LABEL: func @cancel_linearize_delinearize_middle_exact_dynamic_basis( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-z0-9]+]]: index) +// CHECK: %[[C1:.+]] = arith.constant 1 : index +// CHECK: %[[SIZEPROD:.+]] = affine.apply #[[$MAP]]()[%[[ARG1]], %[[ARG2]]] +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[C1]], %[[ARG0]], %[[C1]]] by (3, %[[SIZEPROD]], 4) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_middle_exact_dynamic_basis(%arg0: index, %arg1: index, %arg2: index) -> index { + %c1 = arith.constant 1 : index + %0:4 = affine.delinearize_index %arg0 into (2, %arg1, %arg2, 8) : index, index, index, index + %1 = affine.linearize_index [%c1, %0#0, %0#1, %0#2, %0#3, %c1] by (3, 2, %arg1, %arg2, 8, 4) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_delinearize_middle_exact_delinearize_unbounded_disjoint( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-z0-9]+]]: index) +// CHECK: %[[LIN:.+]] = affine.linearize_index disjoint [%[[ARG1]], %[[ARG0]], %[[ARG2]]] by (9, 30, 7) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_middle_exact_delinearize_unbounded_disjoint(%arg0: index, %arg1: index, %arg2: index) -> index { + %0:3 = affine.delinearize_index %arg0 into (3, 5) : index, index, index + %1 = affine.linearize_index disjoint [%arg1, %0#0, %0#1, %0#2, %arg2] by (9, 2, 3, 5, 7) : index + return %1 : index +} + +// ----- + +// Unlike in the test above, the linerize indices aren't asserted to be disjoint, so +// we can't know if the `2` from the basis is a correct bound. 
+// CHECK-LABEL: func @dont_cancel_linearize_delinearize_middle_exact_delinearize_unbounded( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-z0-9]+]]: index) +// CHECK: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (3) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[ARG1]], %[[DELIN]]#0, %[[DELIN]]#1, %[[ARG2]]] by (9, 2, 3, 7) +// CHECK: return %[[LIN]] + +func.func @dont_cancel_linearize_delinearize_middle_exact_delinearize_unbounded(%arg0: index, %arg1: index, %arg2: index) -> index { + %0:2 = affine.delinearize_index %arg0 into (3) : index, index + %1 = affine.linearize_index [%arg1, %0#0, %0#1, %arg2] by (9, 2, 3, 7) : index + return %1 : index +} + +// ----- + +// The presence of a `disjoint` here tells us that the "unbounded" term on the +// delinearization can't have been above 2. +// CHECK-LABEL: func @cancel_linearize_delinearize_middle_delinearize_unbounded_disjoint_implied_bound( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-z0-9]+]]: index) +// CHECK: %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (6, 5) +// CHECK: %[[LIN:.+]] = affine.linearize_index disjoint [%[[ARG1]], %[[DELIN]]#0, %[[ARG2]]] by (9, 6, 7) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_middle_delinearize_unbounded_disjoint_implied_bound(%arg0: index, %arg1: index, %arg2: index) -> index { + %0:3 = affine.delinearize_index %arg0 into (3, 5) : index, index, index + %1 = affine.linearize_index disjoint [%arg1, %0#0, %0#1, %arg2] by (9, 2, 3, 7) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_delinearize_multiple_matches( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index) +// CHECK: %[[C0:.+]] = arith.constant 0 +// CHECK: %[[DELIN:.+]]:4 = affine.delinearize_index %[[ARG0]] into (4, 16, 4, 64) +// CHECK: 
%[[LIN:.+]] = affine.linearize_index [%[[ARG1]], %[[DELIN]]#1, %[[C0]], %[[DELIN]]#3] by (4, 16, 4, 64) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_multiple_matches(%arg0: index, %arg1: index) -> index { + %c0 = arith.constant 0 : index + %0:7 = affine.delinearize_index %arg0 into (4, 4, 4, 4, 4, 4, 4) : index, index, index, index, index, index, index + %1 = affine.linearize_index [%arg1, %0#1, %0#2, %c0, %0#4, %0#5, %0#6] by (4, 4, 4, 4, 4, 4, 4) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_delinearize_multiple_delinearizes( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (32, 32) +// CHECK: return %[[LIN]] +func.func @cancel_linearize_delinearize_multiple_delinearizes(%arg0: index, %arg1: index) -> index { + %0:2 = affine.delinearize_index %arg0 into (4, 8) : index, index + %1:2 = affine.delinearize_index %arg1 into (2, 16) : index, index + %2 = affine.linearize_index [%0#0, %0#1, %1#0, %1#1] by (4, 8, 2, 16) : index + return %2 : index +} + +// ----- + // Don't cancel because the values from the delinearize aren't used in order -// CHECK-LABEL: func @no_cancel_linearize_denearize_permuted( +// CHECK-LABEL: func @no_cancel_linearize_delinearize_permuted( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index) // CHECK: %[[DELIN:.+]]:3 = affine.delinearize_index %[[ARG0]] into (%[[ARG1]], 4, %[[ARG2]]) -// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[DELIN]]#2, %[[DELIN]]#1] by (%[[ARG1]], 4, %[[ARG2]]) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[DELIN]]#2, %[[DELIN]]#1] by (%[[ARG1]], %[[ARG2]], 4) // CHECK: return %[[LIN]] -func.func @no_cancel_linearize_denearize_permuted(%arg0: index, %arg1: index, %arg2: index) -> index { +func.func 
@no_cancel_linearize_delinearize_permuted(%arg0: index, %arg1: index, %arg2: index) -> index { %0:3 = affine.delinearize_index %arg0 into (%arg1, 4, %arg2) : index, index, index - %1 = affine.linearize_index [%0#0, %0#2, %0#1] by (%arg1, 4, %arg2) : index + %1 = affine.linearize_index [%0#0, %0#2, %0#1] by (%arg1, %arg2, 4) : index + return %1 : index +} + +// ----- + +// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 3)> +// But these cancel because they're a contiguous segment +// CHECK-LABEL: func @partial_cancel_linearize_delinearize_not_fully_permuted( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index) +// CHECK: %[[SIZEPROD:.+]] = affine.apply #[[$MAP]]()[%[[ARG2]]] +// CHECK: %[[DELIN:.+]]:3 = affine.delinearize_index %[[ARG0]] into (%[[ARG1]], 4, %[[SIZEPROD]]) +// CHECK: %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[DELIN]]#2, %[[DELIN]]#1] by (%[[ARG1]], %[[SIZEPROD]], 4) +// CHECK: return %[[LIN]] +func.func @partial_cancel_linearize_delinearize_not_fully_permuted(%arg0: index, %arg1: index, %arg2: index) -> index { + %0:4 = affine.delinearize_index %arg0 into (%arg1, 4, %arg2, 3) : index, index, index, index + %1 = affine.linearize_index [%0#0, %0#2, %0#3, %0#1] by (%arg1, %arg2, 3, 4) : index return %1 : index } // ----- +// Ensure we don't get SSA errors when creating new `affine.delinearize` operations. 
+// CHECK-LABEL: func @cancel_linearize_delinearize_placement +// CHECK-SAME: (%[[ARG0:.+]]: index) +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[NEW_DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (8, 32) : index, index +// CHECK-NEXT: %[[DELIN_PART:.+]]:2 = affine.delinearize_index %[[NEW_DELIN]]#1 into (8, 4) : index, index +// CHECK-NEXT: %[[L1:.+]] = affine.linearize_index disjoint [%[[DELIN_PART]]#1, %[[NEW_DELIN]]#0, %[[C0]], %[[C0]]] by (4, 8, 4, 8) +// CHECK-NEXT: %[[L2:.+]] = affine.linearize_index disjoint [%[[NEW_DELIN]]#1, %[[C0]], %[[C0]]] by (32, 8, 4) +// CHECK-NEXT: %[[L3:.+]] = affine.linearize_index disjoint [%[[DELIN_PART]]#0, %[[NEW_DELIN]]#0, %[[C0]], %[[C0]]] by (8, 8, 4, 4) +// CHECK-NEXT: return %[[L1]], %[[L2]], %[[L3]] +func.func @cancel_linearize_delinearize_placement(%arg0: index) -> (index, index, index) { + %c0 = arith.constant 0 : index + %0:3 = affine.delinearize_index %arg0 into (8, 8, 4) : index, index, index + %1 = affine.linearize_index disjoint [%0#2, %0#0, %c0, %c0] by (4, 8, 4, 8) : index + %2 = affine.linearize_index disjoint [%0#1, %0#2, %c0, %c0] by (8, 4, 8, 4) : index + %3 = affine.linearize_index disjoint [%0#1, %0#0, %c0, %c0] by (8, 8, 4, 4) : index + return %1, %2, %3 : index, index, index +} + +// ----- + // Won't cancel because the linearize and delinearize are using a different basis -// CHECK-LABEL: func @no_cancel_linearize_denearize_different_basis( +// CHECK-LABEL: func @no_cancel_linearize_delinearize_different_basis( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index) // CHECK: %[[DELIN:.+]]:3 = affine.delinearize_index %[[ARG0]] into (%[[ARG1]], 4, %[[ARG2]]) // CHECK: %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[DELIN]]#1, %[[DELIN]]#2] by (%[[ARG1]], 8, %[[ARG2]]) // CHECK: return %[[LIN]] -func.func @no_cancel_linearize_denearize_different_basis(%arg0: index, %arg1: index, %arg2: index) 
-> index { +func.func @no_cancel_linearize_delinearize_different_basis(%arg0: index, %arg1: index, %arg2: index) -> index { %0:3 = affine.delinearize_index %arg0 into (%arg1, 4, %arg2) : index, index, index %1 = affine.linearize_index [%0#0, %0#1, %0#2] by (%arg1, 8, %arg2) : index return %1 : index From 18b47373cb47f1f63ab1f6e126ccfb22cc52963c Mon Sep 17 00:00:00 2001 From: jmriesen <20286401+jmriesen@users.noreply.github.com> Date: Fri, 3 Jan 2025 15:18:39 -0600 Subject: [PATCH 126/480] Updating broken/outdated links in the ProgrammerManual (#119472) Fixes llvm/llvm-project#117897 --- llvm/docs/ProgrammersManual.rst | 6 +++--- llvm/include/llvm/IR/PassManager.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst index 98803ddffd082..e2829eb5a8846 100644 --- a/llvm/docs/ProgrammersManual.rst +++ b/llvm/docs/ProgrammersManual.rst @@ -3358,15 +3358,15 @@ the ``PassManager.h`` system, and there is a more detailed introduction to it by Sean Parent in several of his talks and papers: #. `Inheritance Is The Base Class of Evil - `_ + `_ - The GoingNative 2013 talk describing this technique, and probably the best place to start. #. `Value Semantics and Concepts-based Polymorphism `_ - The C++Now! 2012 talk describing this technique in more detail. #. `Sean Parent's Papers and Presentations - `_ - - A GitHub project full of links to slides, video, and sometimes code. + `_ + - Links to slides, videos, and sometimes code. 
When deciding between creating a type hierarchy (with either tagged or virtual dispatch) and using templates or concepts-based polymorphism, consider whether diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h index 5dab9d0d0a797..b5230047b0e12 100644 --- a/llvm/include/llvm/IR/PassManager.h +++ b/llvm/include/llvm/IR/PassManager.h @@ -28,9 +28,9 @@ /// polymorphism as outlined in the "Value Semantics and Concept-based /// Polymorphism" talk (or its abbreviated sibling "Inheritance Is The Base /// Class of Evil") by Sean Parent: -/// * http://github.com/sean-parent/sean-parent.github.com/wiki/Papers-and-Presentations +/// * https://sean-parent.stlab.cc/papers-and-presentations /// * http://www.youtube.com/watch?v=_BpMYeUFXv8 -/// * http://channel9.msdn.com/Events/GoingNative/2013/Inheritance-Is-The-Base-Class-of-Evil +/// * https://learn.microsoft.com/en-us/shows/goingnative-2013/inheritance-base-class-of-evil /// //===----------------------------------------------------------------------===// From d85b22ed5dbb794835fd4b5166d5bb79ad9e09f2 Mon Sep 17 00:00:00 2001 From: kefan cao <45958009+caokefan@users.noreply.github.com> Date: Sat, 4 Jan 2025 05:32:02 +0800 Subject: [PATCH 127/480] [Clang][ASTMatcher] Add `dependentTemplateSpecializationType` matcher (#121435) Fixes https://github.com/llvm/llvm-project/issues/121307 --- clang/docs/LibASTMatchersReference.html | 11 +++++++++++ clang/docs/ReleaseNotes.rst | 2 ++ clang/include/clang/ASTMatchers/ASTMatchers.h | 12 ++++++++++++ clang/lib/ASTMatchers/ASTMatchersInternal.cpp | 2 ++ clang/lib/ASTMatchers/Dynamic/Registry.cpp | 1 + clang/unittests/AST/ASTImporterTest.cpp | 4 ---- .../unittests/ASTMatchers/ASTMatchersNodeTest.cpp | 15 +++++++++++++++ 7 files changed, 43 insertions(+), 4 deletions(-) diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html index 8564f2650d205..fc55788801325 100644 --- a/clang/docs/LibASTMatchersReference.html +++ 
b/clang/docs/LibASTMatchersReference.html @@ -2546,6 +2546,17 @@

Node Matchers

}; +Matcher<Type>dependentTemplateSpecializationTypeMatcher<DependentTemplateSpecializationType>... +
Matches a dependent template specialization type.
+
+Example matches A::template B
+
+  template struct A;
+  template struct declToImport {
+    typename A::template B a;
+  };
+
+ Matcher<Type>deducedTemplateSpecializationTypeMatcher<DeducedTemplateSpecializationType>...
Matches C++17 deduced template specialization types, e.g. deduced class
 template types.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 61d6aa2216cd0..5e75fc447636e 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1114,6 +1114,8 @@ AST Matchers
 
 - Add ``dependentNameType`` matcher to match a dependent name type.
 
+- Add ``dependentTemplateSpecializationType`` matcher to match a dependent template specialization type.
+
 clang-format
 ------------
 
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index 9a046714068a5..dd0fedb2cda2d 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -7721,6 +7721,18 @@ AST_MATCHER_P(DecayedType, hasDecayedType, internal::Matcher,
 /// \endcode
 extern const AstTypeMatcher dependentNameType;
 
+/// Matches a dependent template specialization type
+///
+/// Example matches A<T>::template B<T>
+/// \code
+///   template<typename T> struct A;
+///   template<typename T> struct declToImport {
+///     typename A<T>::template B<T> a;
+///   };
+/// \endcode
+extern const AstTypeMatcher<DependentTemplateSpecializationType>
+    dependentTemplateSpecializationType;
+
 /// Matches declarations whose declaration context, interpreted as a
 /// Decl, matches \c InnerMatcher.
 ///
diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
index a47633bf4bae2..9c7943a98d652 100644
--- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
+++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
@@ -1109,6 +1109,8 @@ const AstTypeMatcher templateTypeParmType;
 const AstTypeMatcher injectedClassNameType;
 const AstTypeMatcher decayedType;
 const AstTypeMatcher dependentNameType;
+const AstTypeMatcher<DependentTemplateSpecializationType>
+    dependentTemplateSpecializationType;
 AST_TYPELOC_TRAVERSE_MATCHER_DEF(hasElementType,
                                  AST_POLYMORPHIC_SUPPORTED_TYPES(ArrayType,
                                                                  ComplexType));
diff --git a/clang/lib/ASTMatchers/Dynamic/Registry.cpp b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
index bfdee412c5328..97e6bbc093fe4 100644
--- a/clang/lib/ASTMatchers/Dynamic/Registry.cpp
+++ b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
@@ -224,6 +224,7 @@ RegistryMaps::RegistryMaps() {
   REGISTER_MATCHER(declRefExpr);
   REGISTER_MATCHER(dependentNameType);
   REGISTER_MATCHER(dependentScopeDeclRefExpr);
+  REGISTER_MATCHER(dependentTemplateSpecializationType);
   REGISTER_MATCHER(declStmt);
   REGISTER_MATCHER(declaratorDecl);
   REGISTER_MATCHER(decltypeType);
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index ee1d896f1ca6d..d197d30df3adf 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -763,10 +763,6 @@ TEST_P(ImportType, ImportPackExpansion) {
                                    implicitCastExpr(has(declRefExpr()))))))));
 }
 
-const internal::VariadicDynCastAllOfMatcher<Type,
-                                            DependentTemplateSpecializationType>
-    dependentTemplateSpecializationType;
-
 TEST_P(ImportType, ImportDependentTemplateSpecialization) {
   MatchVerifier Verifier;
   testImport("template"
diff --git a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
index b8521e2f95768..680e21840b7d3 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
@@ -1926,6 +1926,21 @@ TEST_P(ASTMatchersTest, DependentNameType) {
       dependentNameType()));
 }
 
+TEST_P(ASTMatchersTest, DependentTemplateSpecializationType) {
+  if (!GetParam().isCXX()) {
+    return;
+  }
+
+  EXPECT_TRUE(matches(
+      R"(
+        template<typename T> struct A;
+        template<typename T> struct declToImport {
+          typename A<T>::template B<T> a;
+        };
+      )",
+      dependentTemplateSpecializationType()));
+}
+
 TEST_P(ASTMatchersTest, RecordType) {
   EXPECT_TRUE(matches("struct S {}; struct S s;",
                       recordType(hasDeclaration(recordDecl(hasName("S"))))));

From d7acf03cecef0bc62240c97a890077755323424f Mon Sep 17 00:00:00 2001
From: Brox Chen 
Date: Fri, 3 Jan 2025 16:32:15 -0500
Subject: [PATCH 128/480] [AMDGPU][True16][MC] true16 for v_rndne_f16 (#120691)

Support true16 format for v_rndne_f16 in MC
---
 llvm/lib/Target/AMDGPU/VOP1Instructions.td    |   2 +-
 llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll     |  41 +++++
 llvm/test/MC/AMDGPU/gfx11_asm_vop1.s          |  75 +++++----
 llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s    |  65 ++++----
 llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s     |  21 ++-
 llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s  |  42 +++++
 .../MC/AMDGPU/gfx11_asm_vop1_t16_promote.s    | 154 +++++++++++++-----
 .../AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s   |  65 ++++----
 .../MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s |  25 ++-
 .../test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s |  69 ++++----
 llvm/test/MC/AMDGPU/gfx12_asm_vop1.s          |  72 ++++----
 llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s    |  62 +++----
 llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s     |  18 +-
 llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s  |  42 +++++
 .../MC/AMDGPU/gfx12_asm_vop1_t16_promote.s    | 154 +++++++++++++-----
 .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s |  69 ++++----
 .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s   |  65 ++++----
 .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s |  25 ++-
 .../Disassembler/AMDGPU/gfx11_dasm_vop1.txt   |  63 +++++--
 .../AMDGPU/gfx11_dasm_vop1_dpp16.txt          |  54 ++++--
 .../AMDGPU/gfx11_dasm_vop1_dpp8.txt           |  17 +-
 .../gfx11_dasm_vop3_dpp16_from_vop1.txt       |  54 ++++--
 .../AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt |  24 ++-
 .../AMDGPU/gfx11_dasm_vop3_from_vop1.txt      |  57 +++++--
 .../AMDGPU/gfx12_dasm_vop1_dpp16.txt          |  50 ++++--
 .../AMDGPU/gfx12_dasm_vop1_dpp8.txt           |  13 +-
 .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt      |  57 +++++--
 .../gfx12_dasm_vop3_from_vop1_dpp16.txt       |  54 ++++--
 .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt |  24 ++-
 29 files changed, 1071 insertions(+), 462 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index fc22b539d7153..f0d2fe0f4f547 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1043,7 +1043,7 @@ defm V_FLOOR_F16_fake16      : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f1
 defm V_CEIL_F16_t16          : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
 defm V_CEIL_F16_fake16       : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
 defm V_TRUNC_F16             : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05d, "v_trunc_f16">;
-defm V_RNDNE_F16_fake16      : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">;
+defm V_RNDNE_F16             : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05e, "v_rndne_f16">;
 defm V_FRACT_F16             : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05f, "v_fract_f16">;
 defm V_SIN_F16               : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x060, "v_sin_f16">;
 defm V_COS_F16               : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x061, "v_cos_f16">;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 4de0c548ad381..795ed6d542a13 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s
 
 declare half @llvm.rint.f16(half %a)
 declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
@@ -63,6 +64,24 @@ define amdgpu_kernel void @rint_f16(
 ; GFX11-NEXT:    v_rndne_f16_e32 v0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: rint_f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s6, -1
+; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s10, s6
+; GFX12-NEXT:    s_mov_b32 s11, s7
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s2
+; GFX12-NEXT:    s_mov_b32 s9, s3
+; GFX12-NEXT:    s_mov_b32 s4, s0
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_mov_b32 s5, s1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[4:7], null
+; GFX12-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
@@ -168,6 +187,28 @@ define amdgpu_kernel void @rint_v2f16(
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: rint_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s6, -1
+; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s10, s6
+; GFX12-NEXT:    s_mov_b32 s11, s7
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s2
+; GFX12-NEXT:    s_mov_b32 s9, s3
+; GFX12-NEXT:    s_mov_b32 s4, s0
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_mov_b32 s5, s1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_rndne_f16_e32 v1, v1
+; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[4:7], null
+; GFX12-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
index 40a6e434b438d..2480be97a7a64 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
@@ -3044,50 +3044,65 @@ v_readfirstlane_b32 ttmp15, v1
 v_readfirstlane_b32 null, v255
 // GFX11: v_readfirstlane_b32 null, v255          ; encoding: [0xff,0x05,0xf8,0x7e]
 
-v_rndne_f16 v5, v1
-// GFX11: v_rndne_f16_e32 v5, v1                  ; encoding: [0x01,0xbd,0x0a,0x7e]
+v_rndne_f16 v5.l, v1.l
+// GFX11: v_rndne_f16_e32 v5.l, v1.l              ; encoding: [0x01,0xbd,0x0a,0x7e]
 
-v_rndne_f16 v5, v127
-// GFX11: v_rndne_f16_e32 v5, v127                ; encoding: [0x7f,0xbd,0x0a,0x7e]
+v_rndne_f16 v5.l, v127.l
+// GFX11: v_rndne_f16_e32 v5.l, v127.l            ; encoding: [0x7f,0xbd,0x0a,0x7e]
 
-v_rndne_f16 v5, s1
-// GFX11: v_rndne_f16_e32 v5, s1                  ; encoding: [0x01,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, s1
+// GFX11: v_rndne_f16_e32 v5.l, s1                ; encoding: [0x01,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, s105
-// GFX11: v_rndne_f16_e32 v5, s105                ; encoding: [0x69,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, s105
+// GFX11: v_rndne_f16_e32 v5.l, s105              ; encoding: [0x69,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, vcc_lo
-// GFX11: v_rndne_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, vcc_lo
+// GFX11: v_rndne_f16_e32 v5.l, vcc_lo            ; encoding: [0x6a,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, vcc_hi
-// GFX11: v_rndne_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, vcc_hi
+// GFX11: v_rndne_f16_e32 v5.l, vcc_hi            ; encoding: [0x6b,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, ttmp15
-// GFX11: v_rndne_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, ttmp15
+// GFX11: v_rndne_f16_e32 v5.l, ttmp15            ; encoding: [0x7b,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, m0
-// GFX11: v_rndne_f16_e32 v5, m0                  ; encoding: [0x7d,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, m0
+// GFX11: v_rndne_f16_e32 v5.l, m0                ; encoding: [0x7d,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, exec_lo
-// GFX11: v_rndne_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, exec_lo
+// GFX11: v_rndne_f16_e32 v5.l, exec_lo           ; encoding: [0x7e,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, exec_hi
-// GFX11: v_rndne_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, exec_hi
+// GFX11: v_rndne_f16_e32 v5.l, exec_hi           ; encoding: [0x7f,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, null
-// GFX11: v_rndne_f16_e32 v5, null                ; encoding: [0x7c,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, null
+// GFX11: v_rndne_f16_e32 v5.l, null              ; encoding: [0x7c,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, -1
-// GFX11: v_rndne_f16_e32 v5, -1                  ; encoding: [0xc1,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, -1
+// GFX11: v_rndne_f16_e32 v5.l, -1                ; encoding: [0xc1,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, 0.5
-// GFX11: v_rndne_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, 0.5
+// GFX11: v_rndne_f16_e32 v5.l, 0.5               ; encoding: [0xf0,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, src_scc
-// GFX11: v_rndne_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, src_scc
+// GFX11: v_rndne_f16_e32 v5.l, src_scc           ; encoding: [0xfd,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v127, 0xfe0b
-// GFX11: v_rndne_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_rndne_f16 v127.l, 0xfe0b
+// GFX11: v_rndne_f16_e32 v127.l, 0xfe0b          ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16 v5.l, v1.h
+// GFX11: v_rndne_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5.l, v127.h
+// GFX11: v_rndne_f16_e32 v5.l, v127.h            ; encoding: [0xff,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v127.l, 0.5
+// GFX11: v_rndne_f16_e32 v127.l, 0.5             ; encoding: [0xf0,0xbc,0xfe,0x7e]
+
+v_rndne_f16 v5.h, src_scc
+// GFX11: v_rndne_f16_e32 v5.h, src_scc           ; encoding: [0xfd,0xbc,0x0a,0x7f]
+
+v_rndne_f16 v127.h, 0xfe0b
+// GFX11: v_rndne_f16_e32 v127.h, 0xfe0b          ; encoding: [0xff,0xbc,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32 v5, v1
 // GFX11: v_rndne_f32_e32 v5, v1                  ; encoding: [0x01,0x47,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s
index 706cb6e32f88a..0f77279397485 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s
@@ -2399,47 +2399,56 @@ v_rcp_iflag_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_rcp_iflag_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_rcp_iflag_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_rndne_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_rndne_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
-v_rndne_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_rndne_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_rndne_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_rndne_f16 v5, v1 row_mirror
-// GFX11: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_mirror
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_half_mirror
-// GFX11: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_half_mirror
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shl:1
-// GFX11: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shl:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shl:15
-// GFX11: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shl:15
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shr:1
-// GFX11: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shr:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shr:15
-// GFX11: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shr:15
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_ror:1
-// GFX11: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_ror:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_ror:15
-// GFX11: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_ror:15
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_rndne_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_rndne_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_rndne_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_rndne_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_rndne_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_rndne_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_rndne_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_rndne_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+v_rndne_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_rndne_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_rndne_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_rndne_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_rndne_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s
index d7051aff42d77..4a89305a5b353 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s
@@ -593,14 +593,23 @@ v_rcp_iflag_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rcp_iflag_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_rcp_iflag_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rndne_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rndne_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_rndne_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_rndne_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_rndne_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+v_rndne_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_rndne_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_rndne_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_rndne_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_rndne_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_rndne_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
index 263ad4bf513a1..7d29adcd73ccc 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
@@ -833,6 +833,12 @@ v_rcp_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
 v_rndne_f16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_rndne_f16_e32 v128.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v128.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_rndne_f16_e32 v255, v1
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -842,6 +848,24 @@ v_rndne_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_rndne_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_rndne_f16_e32 v255.h, v1.h
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_rndne_f16_e32 v5, v199
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -851,6 +875,24 @@ v_rndne_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rndne_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_rndne_f16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
 v_rsq_f16_e32 v128.h, 0xfe0b
 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
 
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s
index 42c36538f2bf6..f2dbb782186f6 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s
@@ -1943,71 +1943,137 @@ v_rcp_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rcp_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: v_rcp_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_rndne_f16 v128, 0xfe0b
-// GFX11: v_rndne_f16_e64 v128, 0xfe0b            ; encoding: [0x80,0x00,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_rndne_f16 v128.h, 0xfe0b
+// GFX11: v_rndne_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_rndne_f16 v255, -1
-// GFX11: v_rndne_f16_e64 v255, -1                ; encoding: [0xff,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+v_rndne_f16 v128.l, 0xfe0b
+// GFX11: v_rndne_f16_e64 v128.l, 0xfe0b          ; encoding: [0x80,0x00,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_rndne_f16 v255, 0.5
-// GFX11: v_rndne_f16_e64 v255, 0.5               ; encoding: [0xff,0x00,0xde,0xd5,0xf0,0x00,0x00,0x00]
+v_rndne_f16 v255.h, -1
+// GFX11: v_rndne_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16 v255, exec_hi
-// GFX11: v_rndne_f16_e64 v255, exec_hi           ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+v_rndne_f16 v255.h, 0.5
+// GFX11: v_rndne_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xf0,0x00,0x00,0x00]
 
-v_rndne_f16 v255, exec_lo
-// GFX11: v_rndne_f16_e64 v255, exec_lo           ; encoding: [0xff,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+v_rndne_f16 v255.h, exec_hi
+// GFX11: v_rndne_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16 v255, m0
-// GFX11: v_rndne_f16_e64 v255, m0                ; encoding: [0xff,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+v_rndne_f16 v255.h, exec_lo
+// GFX11: v_rndne_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
-v_rndne_f16 v255, null
-// GFX11: v_rndne_f16_e64 v255, null              ; encoding: [0xff,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+v_rndne_f16 v255.h, m0
+// GFX11: v_rndne_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
-v_rndne_f16 v255, s1
-// GFX11: v_rndne_f16_e64 v255, s1                ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+v_rndne_f16 v255.h, null
+// GFX11: v_rndne_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
-v_rndne_f16 v255, s105
-// GFX11: v_rndne_f16_e64 v255, s105              ; encoding: [0xff,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+v_rndne_f16 v255.h, s1
+// GFX11: v_rndne_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16 v255, src_scc
-// GFX11: v_rndne_f16_e64 v255, src_scc           ; encoding: [0xff,0x00,0xde,0xd5,0xfd,0x00,0x00,0x00]
+v_rndne_f16 v255.h, s105
+// GFX11: v_rndne_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16 v255, ttmp15
-// GFX11: v_rndne_f16_e64 v255, ttmp15            ; encoding: [0xff,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+v_rndne_f16 v255.h, src_scc
+// GFX11: v_rndne_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xfd,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1
-// GFX11: v_rndne_f16_e64 v255, v1                ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+v_rndne_f16 v255.h, ttmp15
+// GFX11: v_rndne_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_rndne_f16 v255.h, v1.h
+// GFX11: v_rndne_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
 
-v_rndne_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_rndne_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v255, v127
-// GFX11: v_rndne_f16_e64 v255, v127              ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x01,0x00,0x00]
+v_rndne_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_rndne_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_rndne_f16 v255.h, v127.h
+// GFX11: v_rndne_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xde,0xd5,0x7f,0x01,0x00,0x00]
 
-v_rndne_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_rndne_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_rndne_f16 v255, vcc_hi
-// GFX11: v_rndne_f16_e64 v255, vcc_hi            ; encoding: [0xff,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_rndne_f16 v255, vcc_lo
-// GFX11: v_rndne_f16_e64 v255, vcc_lo            ; encoding: [0xff,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+v_rndne_f16 v255.h, vcc_hi
+// GFX11: v_rndne_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199
-// GFX11: v_rndne_f16_e64 v5, v199                ; encoding: [0x05,0x00,0xde,0xd5,0xc7,0x01,0x00,0x00]
+v_rndne_f16 v255.h, vcc_lo
+// GFX11: v_rndne_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_rndne_f16 v255.l, -1
+// GFX11: v_rndne_f16_e64 v255.l, -1              ; encoding: [0xff,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_rndne_f16 v255.l, 0.5
+// GFX11: v_rndne_f16_e64 v255.l, 0.5             ; encoding: [0xff,0x00,0xde,0xd5,0xf0,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, exec_hi
+// GFX11: v_rndne_f16_e64 v255.l, exec_hi         ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, exec_lo
+// GFX11: v_rndne_f16_e64 v255.l, exec_lo         ; encoding: [0xff,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, m0
+// GFX11: v_rndne_f16_e64 v255.l, m0              ; encoding: [0xff,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, null
+// GFX11: v_rndne_f16_e64 v255.l, null            ; encoding: [0xff,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, s1
+// GFX11: v_rndne_f16_e64 v255.l, s1              ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, s105
+// GFX11: v_rndne_f16_e64 v255.l, s105            ; encoding: [0xff,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, src_scc
+// GFX11: v_rndne_f16_e64 v255.l, src_scc         ; encoding: [0xff,0x00,0xde,0xd5,0xfd,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, ttmp15
+// GFX11: v_rndne_f16_e64 v255.l, ttmp15          ; encoding: [0xff,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, v1.l
+// GFX11: v_rndne_f16_e64 v255.l, v1.l            ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_rndne_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16 v255.l, v127.l
+// GFX11: v_rndne_f16_e64 v255.l, v127.l          ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x01,0x00,0x00]
+
+v_rndne_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_rndne_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_rndne_f16 v255.l, vcc_hi
+// GFX11: v_rndne_f16_e64 v255.l, vcc_hi          ; encoding: [0xff,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, vcc_lo
+// GFX11: v_rndne_f16_e64 v255.l, vcc_lo          ; encoding: [0xff,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+
+v_rndne_f16 v5.h, v199.h
+// GFX11: v_rndne_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0xc7,0x01,0x00,0x00]
+
+v_rndne_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_rndne_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_rndne_f16 v5.l, v199.l
+// GFX11: v_rndne_f16_e64 v5.l, v199.l            ; encoding: [0x05,0x00,0xde,0xd5,0xc7,0x01,0x00,0x00]
+
+v_rndne_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_rndne_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_rsq_f16 v128, 0xfe0b
 // GFX11: v_rsq_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd6,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s
index 874fb5bffa0ad..b0a9478203a34 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s
@@ -2506,47 +2506,56 @@ v_rcp_iflag_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 boun
 v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_mirror
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+v_rndne_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_rndne_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_rndne_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s
index 8e6783e0f413c..eae5d3e799ba7 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s
@@ -742,17 +742,26 @@ v_rcp_iflag_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xab,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+v_rndne_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x08,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc1,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s
index 3f9af472a6372..9ecae211ecd86 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s
@@ -2977,50 +2977,59 @@ v_rcp_iflag_f32_e64 v5, src_scc mul:4
 v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2
 // GFX11: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
 
-v_rndne_f16_e64 v5, v1
-// GFX11: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+v_rndne_f16_e64 v5.l, v1.l
+// GFX11: v_rndne_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
 
-v_rndne_f16_e64 v5, v255
-// GFX11: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+v_rndne_f16_e64 v5.l, v255.l
+// GFX11: v_rndne_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
 
-v_rndne_f16_e64 v5, s1
-// GFX11: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, s1
+// GFX11: v_rndne_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, s105
-// GFX11: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, s105
+// GFX11: v_rndne_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, vcc_lo
-// GFX11: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, vcc_lo
+// GFX11: v_rndne_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, vcc_hi
-// GFX11: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, vcc_hi
+// GFX11: v_rndne_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, ttmp15
-// GFX11: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, ttmp15
+// GFX11: v_rndne_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, m0
-// GFX11: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, m0
+// GFX11: v_rndne_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, exec_lo
-// GFX11: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, exec_lo
+// GFX11: v_rndne_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, exec_hi
-// GFX11: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, exec_hi
+// GFX11: v_rndne_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, null
-// GFX11: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, null
+// GFX11: v_rndne_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, -1
-// GFX11: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, -1
+// GFX11: v_rndne_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, 0.5 mul:2
-// GFX11: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+v_rndne_f16_e64 v5.l, 0.5 mul:2
+// GFX11: v_rndne_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
 
-v_rndne_f16_e64 v5, src_scc mul:4
-// GFX11: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+v_rndne_f16_e64 v5.l, src_scc mul:4
+// GFX11: v_rndne_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
 
-v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX11: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX11: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16_e64 v5.h, v1.h
+// GFX11: [0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v5.l, v255.h
+// GFX11: [0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX11: [0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32_e64 v5, v1
 // GFX11: v_rndne_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
index 6c69f3fb78bc0..089ad41448f00 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
@@ -3102,50 +3102,62 @@ v_readfirstlane_b32 ttmp15, v1
 v_readfirstlane_b32 null, v255
 // GFX12: v_readfirstlane_b32 null, v255 ; encoding: [0xff,0x05,0xf8,0x7e]
 
-v_rndne_f16 v5, v1
-// GFX12: v_rndne_f16_e32 v5, v1 ; encoding: [0x01,0xbd,0x0a,0x7e]
+v_rndne_f16 v5.l, v1.l
+// GFX12: v_rndne_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbd,0x0a,0x7e]
 
-v_rndne_f16 v5, v127
-// GFX12: v_rndne_f16_e32 v5, v127 ; encoding: [0x7f,0xbd,0x0a,0x7e]
+v_rndne_f16 v5.l, v127.l
+// GFX12: v_rndne_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbd,0x0a,0x7e]
 
-v_rndne_f16 v5, s1
-// GFX12: v_rndne_f16_e32 v5, s1 ; encoding: [0x01,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, s1
+// GFX12: v_rndne_f16_e32 v5.l, s1 ; encoding: [0x01,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, s105
-// GFX12: v_rndne_f16_e32 v5, s105 ; encoding: [0x69,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, s105
+// GFX12: v_rndne_f16_e32 v5.l, s105 ; encoding: [0x69,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, vcc_lo
-// GFX12: v_rndne_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, vcc_lo
+// GFX12: v_rndne_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, vcc_hi
-// GFX12: v_rndne_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, vcc_hi
+// GFX12: v_rndne_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, ttmp15
-// GFX12: v_rndne_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, ttmp15
+// GFX12: v_rndne_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, m0
-// GFX12: v_rndne_f16_e32 v5, m0 ; encoding: [0x7d,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, m0
+// GFX12: v_rndne_f16_e32 v5.l, m0 ; encoding: [0x7d,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, exec_lo
-// GFX12: v_rndne_f16_e32 v5, exec_lo ; encoding: [0x7e,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, exec_lo
+// GFX12: v_rndne_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, exec_hi
-// GFX12: v_rndne_f16_e32 v5, exec_hi ; encoding: [0x7f,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, exec_hi
+// GFX12: v_rndne_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, null
-// GFX12: v_rndne_f16_e32 v5, null ; encoding: [0x7c,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, null
+// GFX12: v_rndne_f16_e32 v5.l, null ; encoding: [0x7c,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, -1
-// GFX12: v_rndne_f16_e32 v5, -1 ; encoding: [0xc1,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, -1
+// GFX12: v_rndne_f16_e32 v5.l, -1 ; encoding: [0xc1,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, 0.5
-// GFX12: v_rndne_f16_e32 v5, 0.5 ; encoding: [0xf0,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, 0.5
+// GFX12: v_rndne_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, src_scc
-// GFX12: v_rndne_f16_e32 v5, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, src_scc
+// GFX12: v_rndne_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v127, 0xfe0b
-// GFX12: v_rndne_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_rndne_f16 v127.l, 0xfe0b
+// GFX12: v_rndne_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16 v5.l, v1.h
+// GFX12: v_rndne_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5.l, v127.h
+// GFX12: v_rndne_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5.h, src_scc
+// GFX12: v_rndne_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7f]
+
+v_rndne_f16 v127.h, 0xfe0b
+// GFX12: v_rndne_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xbc,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32 v5, v1
 // GFX12: v_rndne_f32_e32 v5, v1 ; encoding: [0x01,0x47,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
index 05a5f8bd44b9c..fc6b9f396a6a7 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
@@ -2452,47 +2452,53 @@ v_rcp_iflag_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_rcp_iflag_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_rcp_iflag_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_rndne_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_rndne_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
-v_rndne_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_rndne_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_rndne_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_rndne_f16 v5, v1 row_mirror
-// GFX12: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_mirror
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_half_mirror
-// GFX12: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_half_mirror
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shl:1
-// GFX12: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shl:1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shl:15
-// GFX12: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shl:15
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shr:1
-// GFX12: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shr:1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shr:15
-// GFX12: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shr:15
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_ror:1
-// GFX12: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_ror:1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_ror:15
-// GFX12: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_ror:15
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_rndne_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_rndne_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_rndne_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_rndne_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_rndne_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_rndne_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_rndne_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_rndne_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_rndne_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_rndne_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_rndne_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
index bf03e7f8e518c..a77b95e1ef0cd 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
@@ -604,14 +604,20 @@ v_rcp_iflag_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rcp_iflag_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_rcp_iflag_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rndne_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rndne_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_rndne_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_rndne_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_rndne_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_rndne_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_rndne_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_rndne_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_rndne_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_rndne_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
index f584b69c33ec8..0be79d016b78f 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
@@ -764,6 +764,12 @@ v_rcp_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 v_rndne_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_rndne_f16_e32 v128.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v128.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_rndne_f16_e32 v255, v1
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -773,6 +779,24 @@ v_rndne_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_rndne_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_rndne_f16_e32 v255.h, v1.h
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_rndne_f16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -782,6 +806,24 @@ v_rndne_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rndne_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_rndne_f16_e32 v5.h, v199.h
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
 v_rsq_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s
index 27e92b7e4f22b..440c1f09f6012 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s
@@ -1903,71 +1903,137 @@ v_rcp_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rcp_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: v_rcp_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_rndne_f16 v128, 0xfe0b
-// GFX12: v_rndne_f16_e64 v128, 0xfe0b            ; encoding: [0x80,0x00,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_rndne_f16 v128.h, 0xfe0b
+// GFX12: v_rndne_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_rndne_f16 v255, -1
-// GFX12: v_rndne_f16_e64 v255, -1                ; encoding: [0xff,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+v_rndne_f16 v128.l, 0xfe0b
+// GFX12: v_rndne_f16_e64 v128.l, 0xfe0b          ; encoding: [0x80,0x00,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_rndne_f16 v255, 0.5
-// GFX12: v_rndne_f16_e64 v255, 0.5               ; encoding: [0xff,0x00,0xde,0xd5,0xf0,0x00,0x00,0x00]
+v_rndne_f16 v255.h, -1
+// GFX12: v_rndne_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16 v255, exec_hi
-// GFX12: v_rndne_f16_e64 v255, exec_hi           ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+v_rndne_f16 v255.h, 0.5
+// GFX12: v_rndne_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xf0,0x00,0x00,0x00]
 
-v_rndne_f16 v255, exec_lo
-// GFX12: v_rndne_f16_e64 v255, exec_lo           ; encoding: [0xff,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+v_rndne_f16 v255.h, exec_hi
+// GFX12: v_rndne_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16 v255, m0
-// GFX12: v_rndne_f16_e64 v255, m0                ; encoding: [0xff,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+v_rndne_f16 v255.h, exec_lo
+// GFX12: v_rndne_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
-v_rndne_f16 v255, null
-// GFX12: v_rndne_f16_e64 v255, null              ; encoding: [0xff,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+v_rndne_f16 v255.h, m0
+// GFX12: v_rndne_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
-v_rndne_f16 v255, s1
-// GFX12: v_rndne_f16_e64 v255, s1                ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+v_rndne_f16 v255.h, null
+// GFX12: v_rndne_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
-v_rndne_f16 v255, s105
-// GFX12: v_rndne_f16_e64 v255, s105              ; encoding: [0xff,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+v_rndne_f16 v255.h, s1
+// GFX12: v_rndne_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16 v255, src_scc
-// GFX12: v_rndne_f16_e64 v255, src_scc           ; encoding: [0xff,0x00,0xde,0xd5,0xfd,0x00,0x00,0x00]
+v_rndne_f16 v255.h, s105
+// GFX12: v_rndne_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16 v255, ttmp15
-// GFX12: v_rndne_f16_e64 v255, ttmp15            ; encoding: [0xff,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+v_rndne_f16 v255.h, src_scc
+// GFX12: v_rndne_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xfd,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1
-// GFX12: v_rndne_f16_e64 v255, v1                ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+v_rndne_f16 v255.h, ttmp15
+// GFX12: v_rndne_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_rndne_f16 v255.h, v1.h
+// GFX12: v_rndne_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
 
-v_rndne_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_rndne_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v255, v127
-// GFX12: v_rndne_f16_e64 v255, v127              ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x01,0x00,0x00]
+v_rndne_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_rndne_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_rndne_f16 v255.h, v127.h
+// GFX12: v_rndne_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xde,0xd5,0x7f,0x01,0x00,0x00]
 
-v_rndne_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_rndne_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_rndne_f16 v255, vcc_hi
-// GFX12: v_rndne_f16_e64 v255, vcc_hi            ; encoding: [0xff,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_rndne_f16 v255, vcc_lo
-// GFX12: v_rndne_f16_e64 v255, vcc_lo            ; encoding: [0xff,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+v_rndne_f16 v255.h, vcc_hi
+// GFX12: v_rndne_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199
-// GFX12: v_rndne_f16_e64 v5, v199                ; encoding: [0x05,0x00,0xde,0xd5,0xc7,0x01,0x00,0x00]
+v_rndne_f16 v255.h, vcc_lo
+// GFX12: v_rndne_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_rndne_f16 v255.l, -1
+// GFX12: v_rndne_f16_e64 v255.l, -1              ; encoding: [0xff,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_rndne_f16 v255.l, 0.5
+// GFX12: v_rndne_f16_e64 v255.l, 0.5             ; encoding: [0xff,0x00,0xde,0xd5,0xf0,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, exec_hi
+// GFX12: v_rndne_f16_e64 v255.l, exec_hi         ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, exec_lo
+// GFX12: v_rndne_f16_e64 v255.l, exec_lo         ; encoding: [0xff,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, m0
+// GFX12: v_rndne_f16_e64 v255.l, m0              ; encoding: [0xff,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, null
+// GFX12: v_rndne_f16_e64 v255.l, null            ; encoding: [0xff,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, s1
+// GFX12: v_rndne_f16_e64 v255.l, s1              ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, s105
+// GFX12: v_rndne_f16_e64 v255.l, s105            ; encoding: [0xff,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, src_scc
+// GFX12: v_rndne_f16_e64 v255.l, src_scc         ; encoding: [0xff,0x00,0xde,0xd5,0xfd,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, ttmp15
+// GFX12: v_rndne_f16_e64 v255.l, ttmp15          ; encoding: [0xff,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, v1.l
+// GFX12: v_rndne_f16_e64 v255.l, v1.l            ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_rndne_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16 v255.l, v127.l
+// GFX12: v_rndne_f16_e64 v255.l, v127.l          ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x01,0x00,0x00]
+
+v_rndne_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_rndne_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_rndne_f16 v255.l, vcc_hi
+// GFX12: v_rndne_f16_e64 v255.l, vcc_hi          ; encoding: [0xff,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, vcc_lo
+// GFX12: v_rndne_f16_e64 v255.l, vcc_lo          ; encoding: [0xff,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+
+v_rndne_f16 v5.h, v199.h
+// GFX12: v_rndne_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0xc7,0x01,0x00,0x00]
+
+v_rndne_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_rndne_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_rndne_f16 v5.l, v199.l
+// GFX12: v_rndne_f16_e64 v5.l, v199.l            ; encoding: [0x05,0x00,0xde,0xd5,0xc7,0x01,0x00,0x00]
+
+v_rndne_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_rndne_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_rsq_f16 v128, 0xfe0b
 // GFX12: v_rsq_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd6,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
index 0ba9874b1a22e..4824241735140 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
@@ -3127,50 +3127,59 @@ v_rcp_iflag_f32_e64 v5, src_scc mul:4
 v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2
 // GFX12: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
 
-v_rndne_f16_e64 v5, v1
-// GFX12: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+v_rndne_f16_e64 v5.l, v1.l
+// GFX12: v_rndne_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
 
-v_rndne_f16_e64 v5, v255
-// GFX12: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+v_rndne_f16_e64 v5.l, v255.l
+// GFX12: v_rndne_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
 
-v_rndne_f16_e64 v5, s1
-// GFX12: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, s1
+// GFX12: v_rndne_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, s105
-// GFX12: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, s105
+// GFX12: v_rndne_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, vcc_lo
-// GFX12: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, vcc_lo
+// GFX12: v_rndne_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, vcc_hi
-// GFX12: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, vcc_hi
+// GFX12: v_rndne_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, ttmp15
-// GFX12: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, ttmp15
+// GFX12: v_rndne_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, m0
-// GFX12: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, m0
+// GFX12: v_rndne_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, exec_lo
-// GFX12: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, exec_lo
+// GFX12: v_rndne_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, exec_hi
-// GFX12: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, exec_hi
+// GFX12: v_rndne_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, null
-// GFX12: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, null
+// GFX12: v_rndne_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, -1
-// GFX12: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, -1
+// GFX12: v_rndne_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, 0.5 mul:2
-// GFX12: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+v_rndne_f16_e64 v5.l, 0.5 mul:2
+// GFX12: v_rndne_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
 
-v_rndne_f16_e64 v5, src_scc mul:4
-// GFX12: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+v_rndne_f16_e64 v5.l, src_scc mul:4
+// GFX12: v_rndne_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
 
-v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX12: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX12: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16_e64 v5.h, v1.h
+// GFX12: v_rndne_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v5.l, v255.h
+// GFX12: v_rndne_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX12: v_rndne_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32_e64 v5, v1
 // GFX12: v_rndne_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
index 197f02719905d..c09471033d144 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
@@ -2377,47 +2377,56 @@ v_rcp_iflag_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 boun
 v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_mirror
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+v_rndne_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_rndne_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_rndne_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
index 0dfc47b4e4020..be3878878b13d 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
@@ -712,17 +712,26 @@ v_rcp_iflag_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xab,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+v_rndne_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
index 8d86bafca059f..a3886e6b3a68d 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
@@ -3090,49 +3090,82 @@
 # GFX11: v_readfirstlane_b32 null, v255          ; encoding: [0xff,0x05,0xf8,0x7e]
 
 0x01,0xbd,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, v1                  ; encoding: [0x01,0xbd,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, v1.l              ; encoding: [0x01,0xbd,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, v1                  ; encoding: [0x01,0xbd,0x0a,0x7e]
 
 0x7f,0xbd,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, v127                ; encoding: [0x7f,0xbd,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, v127.l            ; encoding: [0x7f,0xbd,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, v127                ; encoding: [0x7f,0xbd,0x0a,0x7e]
 
 0x01,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, s1                  ; encoding: [0x01,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, s1                ; encoding: [0x01,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, s1                  ; encoding: [0x01,0xbc,0x0a,0x7e]
 
 0x69,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, s105                ; encoding: [0x69,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, s105              ; encoding: [0x69,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, s105                ; encoding: [0x69,0xbc,0x0a,0x7e]
 
 0x6a,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, vcc_lo            ; encoding: [0x6a,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbc,0x0a,0x7e]
 
 0x6b,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, vcc_hi            ; encoding: [0x6b,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbc,0x0a,0x7e]
 
 0x7b,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, ttmp15            ; encoding: [0x7b,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbc,0x0a,0x7e]
 
 0x7d,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, m0                  ; encoding: [0x7d,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, m0                ; encoding: [0x7d,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, m0                  ; encoding: [0x7d,0xbc,0x0a,0x7e]
 
 0x7e,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, exec_lo           ; encoding: [0x7e,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbc,0x0a,0x7e]
 
 0x7f,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, exec_hi           ; encoding: [0x7f,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbc,0x0a,0x7e]
 
 0x7c,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, null                ; encoding: [0x7c,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, null              ; encoding: [0x7c,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, null                ; encoding: [0x7c,0xbc,0x0a,0x7e]
 
 0xc1,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, -1                  ; encoding: [0xc1,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, -1                ; encoding: [0xc1,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, -1                  ; encoding: [0xc1,0xbc,0x0a,0x7e]
 
 0xf0,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, 0.5               ; encoding: [0xf0,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbc,0x0a,0x7e]
 
 0xfd,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, src_scc           ; encoding: [0xfd,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbc,0x0a,0x7e]
 
 0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_rndne_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e32 v127.l, 0xfe0b          ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0x81,0xbd,0x0a,0x7e
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbd,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xbd,0x0a,0x7e]
+
+0xff,0xbd,0x0a,0x7e
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, v127.h            ; encoding: [0xff,0xbd,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xbd,0x0a,0x7e]
+
+0xf0,0xbc,0xfe,0x7e
+# GFX11-REAL16: v_rndne_f16_e32 v127.l, 0.5             ; encoding: [0xf0,0xbc,0xfe,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v127, 0.5               ; encoding: [0xf0,0xbc,0xfe,0x7e]
+
+0xfd,0xbc,0x0a,0x7f
+# GFX11-REAL16: v_rndne_f16_e32 v5.h, src_scc           ; encoding: [0xfd,0xbc,0x0a,0x7f]
+
+0xff,0xbc,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e32 v127.h, 0xfe0b          ; encoding: [0xff,0xbc,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x47,0x0a,0x7e
 # GFX11: v_rndne_f32_e32 v5, v1                  ; encoding: [0x01,0x47,0x0a,0x7e]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt
index f01ce5a31be3c..9f857cd05696c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt
@@ -2473,46 +2473,72 @@
 # GFX11: v_rcp_iflag_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX11: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-REAL16: v_rndne_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xbc,0xfe,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_rndne_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_rndne_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_rndne_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX11-REAL16: v_rndne_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt
index 0f102c52a3666..c45033916cd05 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt
@@ -509,10 +509,23 @@
 # GFX11: v_rcp_iflag_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX11: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xbc,0xfe,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_rndne_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_rndne_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_rndne_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt
index 6e5239522df41..4f12775fb3796 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt
@@ -2615,46 +2615,72 @@
 # GFX11: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX11: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX11: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX11: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt
index d7f9e86d3ca00..638daca3fdd4f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt
@@ -731,16 +731,32 @@
 # GFX11: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX11: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX11: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt
index db5ba967d709b..1b7677b8c088c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt
@@ -3051,49 +3051,76 @@
 # GFX11: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08
-# GFX11: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10
-# GFX11: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX11: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_rndne_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt
index d37c9229f1666..1635fdab66d86 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt
@@ -2581,46 +2581,68 @@
 # GFX12: v_rcp_iflag_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX12: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-REAL16: v_rndne_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_rndne_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX12-REAL16: v_rndne_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt
index f3dce5e6d5b93..c1fa6aa634f49 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt
@@ -497,10 +497,19 @@
 # GFX12: v_rcp_iflag_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX12: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_rndne_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_rndne_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt
index e4b619d87e400..43c18a7836687 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt
@@ -3097,49 +3097,76 @@
 # GFX12: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08
-# GFX12: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10
-# GFX12: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX12: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_rndne_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
index b77cf5ab6efc1..cc344f329c2d2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
@@ -2471,46 +2471,72 @@
 # GFX12: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX12: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX12: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX12: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
index 50339f51c5629..428349fec54fa 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
@@ -707,16 +707,32 @@
 # GFX12: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX12: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX12: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]

From 78f04477d658ae3de3cd416947c5bac65262a9ec Mon Sep 17 00:00:00 2001
From: Maksim Levental 
Date: Fri, 3 Jan 2025 13:52:07 -0800
Subject: [PATCH 129/480] [mlir][python] declare `_PyClassMethod_New` undefined
 at link time (#121597)

`NanobindAdaptors.h` uses `PyClassMethod_New` to build `pure_subclass`es
but nanobind doesn't declare this API as undefined in its linker flags.
So we need to declare it as such for downstream users that do not
pass something like `-undefined dynamic_lookup`.
---
 mlir/cmake/modules/AddMLIRPython.cmake | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake
index 9d4e06c7909c8..717a503468a85 100644
--- a/mlir/cmake/modules/AddMLIRPython.cmake
+++ b/mlir/cmake/modules/AddMLIRPython.cmake
@@ -683,6 +683,13 @@ function(add_mlir_python_extension libname extname)
           ${eh_rtti_enable}
       )
     endif()
+    
+    if(APPLE)
+      # NanobindAdaptors.h uses PyClassMethod_New to build `pure_subclass`es but nanobind
+      # doesn't declare this API as undefined in its linker flags. So we need to declare it as such
+      # for downstream users that do not do something like `-undefined dynamic_lookup`.
+      set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-U -Wl,_PyClassMethod_New")
+    endif()
   endif()
 
   target_compile_options(${libname} PRIVATE ${eh_rtti_enable})

From 9165848c8285884938583f5c3a35c97ec03ee486 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= 
Date: Fri, 3 Jan 2025 14:37:14 -0800
Subject: [PATCH 130/480] [flang][cuda] Sync global descriptor when nullifying
 pointer (#121595)

---
 .../flang/Optimizer/Builder/CUFCommon.h       |  6 ++++++
 flang/lib/Lower/Allocatable.cpp               | 19 ++-----------------
 flang/lib/Lower/Bridge.cpp                    |  2 ++
 flang/lib/Optimizer/Builder/CUFCommon.cpp     | 17 +++++++++++++++++
 flang/test/Lower/CUDA/cuda-pointer-sync.cuf   |  6 +++++-
 5 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h
index df1b709dc8608..b99e330429622 100644
--- a/flang/include/flang/Optimizer/Builder/CUFCommon.h
+++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h
@@ -15,6 +15,10 @@
 
 static constexpr llvm::StringRef cudaDeviceModuleName = "cuda_device_mod";
 
+namespace fir {
+class FirOpBuilder;
+} // namespace fir
+
 namespace cuf {
 
 /// Retrieve or create the CUDA Fortran GPU module in the given \p mod.
@@ -24,6 +28,8 @@ mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
 bool isInCUDADeviceContext(mlir::Operation *op);
 bool isRegisteredDeviceGlobal(fir::GlobalOp op);
 
+void genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder);
+
 } // namespace cuf
 
 #endif // FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 4c64870675816..5c63c79892f42 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -1088,22 +1088,6 @@ bool Fortran::lower::isArraySectionWithoutVectorSubscript(
          !Fortran::evaluate::HasVectorSubscript(expr);
 }
 
-static void genCUFPointerSync(const mlir::Value box,
-                              fir::FirOpBuilder &builder) {
-  if (auto declareOp = box.getDefiningOp()) {
-    if (auto addrOfOp = declareOp.getMemref().getDefiningOp()) {
-      auto mod = addrOfOp->getParentOfType();
-      if (auto globalOp =
-              mod.lookupSymbol(addrOfOp.getSymbol())) {
-        if (cuf::isRegisteredDeviceGlobal(globalOp)) {
-          builder.create(box.getLoc(),
-                                                addrOfOp.getSymbol());
-        }
-      }
-    }
-  }
-}
-
 void Fortran::lower::associateMutableBox(
     Fortran::lower::AbstractConverter &converter, mlir::Location loc,
     const fir::MutableBoxValue &box, const Fortran::lower::SomeExpr &source,
@@ -1111,12 +1095,13 @@ void Fortran::lower::associateMutableBox(
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   if (Fortran::evaluate::UnwrapExpr(source)) {
     fir::factory::disassociateMutableBox(builder, loc, box);
+    cuf::genPointerSync(box.getAddr(), builder);
     return;
   }
   if (converter.getLoweringOptions().getLowerToHighLevelFIR()) {
     fir::ExtendedValue rhs = converter.genExprAddr(loc, source, stmtCtx);
     fir::factory::associateMutableBox(builder, loc, box, rhs, lbounds);
-    genCUFPointerSync(box.getAddr(), builder);
+    cuf::genPointerSync(box.getAddr(), builder);
     return;
   }
   // The right hand side is not be evaluated into a temp. Array sections can
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index c7e2635230e98..c7bf424815548 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -34,6 +34,7 @@
 #include "flang/Lower/StatementContext.h"
 #include "flang/Lower/Support/Utils.h"
 #include "flang/Optimizer/Builder/BoxValue.h"
+#include "flang/Optimizer/Builder/CUFCommon.h"
 #include "flang/Optimizer/Builder/Character.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/Runtime/Assign.h"
@@ -3952,6 +3953,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       } else {
         fir::MutableBoxValue box = genExprMutableBox(loc, *expr);
         fir::factory::disassociateMutableBox(*builder, loc, box);
+        cuf::genPointerSync(box.getAddr(), *builder);
       }
     }
   }
diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp
index 81a8a90ce394e..39848205f47af 100644
--- a/flang/lib/Optimizer/Builder/CUFCommon.cpp
+++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp
@@ -7,7 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Optimizer/Builder/CUFCommon.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 
@@ -54,3 +56,18 @@ bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) {
     return true;
   return false;
 }
+
+void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) {
+  if (auto declareOp = box.getDefiningOp()) {
+    if (auto addrOfOp = declareOp.getMemref().getDefiningOp()) {
+      auto mod = addrOfOp->getParentOfType();
+      if (auto globalOp =
+              mod.lookupSymbol(addrOfOp.getSymbol())) {
+        if (cuf::isRegisteredDeviceGlobal(globalOp)) {
+          builder.create(box.getLoc(),
+                                                addrOfOp.getSymbol());
+        }
+      }
+    }
+  }
+}
diff --git a/flang/test/Lower/CUDA/cuda-pointer-sync.cuf b/flang/test/Lower/CUDA/cuda-pointer-sync.cuf
index e17869b2d6357..4c64f4bd34aa0 100644
--- a/flang/test/Lower/CUDA/cuda-pointer-sync.cuf
+++ b/flang/test/Lower/CUDA/cuda-pointer-sync.cuf
@@ -8,10 +8,14 @@ use devptr
 real, device, target, dimension(4) :: a_dev
 a_dev = 42.0
 dev_ptr => a_dev
+
+dev_ptr => null()
+
+nullify(dev_ptr)
 end
 
 ! CHECK: fir.global @_QMdevptrEdev_ptr {data_attr = #cuf.cuda} : !fir.box>>
 ! CHECK-LABEL: func.func @_QQmain()
 ! CHECK: fir.embox
 ! CHECK: fir.store
-! CHECK: cuf.sync_descriptor @_QMdevptrEdev_ptr
+! CHECK-COUNT-3: cuf.sync_descriptor @_QMdevptrEdev_ptr

From 1b5deaeb2ad0a7ea643f24899e4aad9461d3d426 Mon Sep 17 00:00:00 2001
From: Tom Stellard 
Date: Fri, 3 Jan 2025 15:02:16 -0800
Subject: [PATCH 131/480] workflows/build-ci-container: Make sure to only test
 local containers (#120827)

The container test is run before we create the :latest tag, so we should
not try to test that tag; otherwise the test will pull the :latest tag from
the GitHub registry and won't test the container we just built.
---
 .github/workflows/build-ci-container.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml
index 50729e0173506..4fa0713b381ce 100644
--- a/.github/workflows/build-ci-container.yml
+++ b/.github/workflows/build-ci-container.yml
@@ -59,8 +59,9 @@ jobs:
 
       - name: Test Container
         run: |
-          for image in ${{ steps.vars.outputs.container-name-tag }} ${{  steps.vars.outputs.container-name }}; do
-            podman run --rm -it $image /usr/bin/bash -x -c 'cd $HOME && printf '\''#include \nint main(int argc, char **argv) { std::cout << "Hello\\n"; }'\'' | clang++ -x c++ - && ./a.out | grep Hello'
+          for image in ${{ steps.vars.outputs.container-name-tag }}; do
+            # Use --pull=never to ensure we are testing the just built image.
+            podman run --pull=never --rm -it $image /usr/bin/bash -x -c 'cd $HOME && printf '\''#include \nint main(int argc, char **argv) { std::cout << "Hello\\n"; }'\'' | clang++ -x c++ - && ./a.out | grep Hello'
           done
 
   push-ci-container:

From 06cf4f970446ce3c4be0a7104115b82c2fae6448 Mon Sep 17 00:00:00 2001
From: Tom Stellard 
Date: Fri, 3 Jan 2025 15:06:35 -0800
Subject: [PATCH 132/480] workflows/new-issues: Use an llvmbot token to add
 labels (#120840)

There is a separate job that mentions teams based on the label added,
and this job won't run if we use the default github token.
---
 .github/workflows/new-issues.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/new-issues.yml b/.github/workflows/new-issues.yml
index ed15fdb9fba6e..3cac57e268513 100644
--- a/.github/workflows/new-issues.yml
+++ b/.github/workflows/new-issues.yml
@@ -15,7 +15,7 @@ jobs:
     steps:
       - uses: llvm/actions/issue-labeler@main
         with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          repo-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}
           configuration-path: .github/new-issues-labeler.yml
           include-title: 1
           include-body: 0

From dfa4312c9b092c23b9b2ec366a8851be729953c4 Mon Sep 17 00:00:00 2001
From: Tom Stellard 
Date: Fri, 3 Jan 2025 15:08:40 -0800
Subject: [PATCH 133/480] workflows/release-binaries: Replace some workflow
 interpolations with env vars (#120860)

This is recommended by the GitHub Actions security hardening guide:
https://docs.github.com/en/actions/security-for-github-actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable
---
 .github/workflows/release-binaries.yml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 1cde628d3f66c..fc5431c96bbf0 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -83,7 +83,7 @@ jobs:
         USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
       shell: bash
       run: |
-        ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --user ${{ github.actor }} --user-token "$USER_TOKEN" check-permissions
+        ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --user "$GITHUB_ACTOR" --user-token "$USER_TOKEN" check-permissions
 
     - name: Collect Variables
       id: vars
@@ -102,8 +102,8 @@ jobs:
           release_version="$trimmed"
           ref="llvmorg-$release_version"
         else
-          release_version="${{ (github.event_name == 'pull_request' && format('PR{0}', github.event.pull_request.number)) || 'CI'}}-${{ github.sha }}"
-          ref=${{ github.sha }}
+          release_version="${{ (github.event_name == 'pull_request' && format('PR{0}', github.event.pull_request.number)) || 'CI'}}-$GITHUB_SHA"
+          ref="$GITHUB_SHA"
         fi
         if [ -n "${{ inputs.upload }}" ]; then
           upload="${{ inputs.upload }}"
@@ -114,20 +114,20 @@ jobs:
         echo "ref=$ref" >> $GITHUB_OUTPUT
         echo "upload=$upload" >> $GITHUB_OUTPUT
 
-        release_binary_basename="LLVM-$release_version-${{ runner.os }}-${{ runner.arch }}"
+        release_binary_basename="LLVM-$release_version-$RUNNER_OS-$RUNNER_ARCH"
         echo "release-binary-basename=$release_binary_basename" >> $GITHUB_OUTPUT
         echo "release-binary-filename=$release_binary_basename.tar.xz" >> $GITHUB_OUTPUT
 
         # Detect necessary CMake flags
-        target="${{ runner.os }}-${{ runner.arch }}"
+        target="$RUNNER_OS-$RUNNER_ARCH"
         echo "enable-pgo=false" >> $GITHUB_OUTPUT
         target_cmake_flags="-DLLVM_RELEASE_ENABLE_PGO=OFF"
         # The macOS builds try to cross compile some libraries so we need to
         # add extra CMake args to disable them.
         # See https://github.com/llvm/llvm-project/issues/99767
-        if [ "${{ runner.os }}" = "macOS" ]; then
+        if [ "$RUNNER_OS" = "macOS" ]; then
           target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_COMPILER_RT_ENABLE_IOS=OFF"
-          if [ "${{ runner.arch }}" = "ARM64" ]; then
+          if [ "$RUNNER_ARCH" = "ARM64" ]; then
             arches=arm64
           else
             arches=x86_64
@@ -137,7 +137,7 @@ jobs:
 
         build_flang="true"
 
-        if [ "${{ runner.os }}" = "Windows" ]; then
+        if [ "$RUNNER_OS" = "Windows" ]; then
           # The build times out on Windows, so we need to disable LTO.
           target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF"
         fi

From ee1adc5aab4fb517314358ce03cfda426da9c4ce Mon Sep 17 00:00:00 2001
From: Adrian Prantl 
Date: Fri, 3 Jan 2025 15:26:40 -0800
Subject: [PATCH 134/480] [lldb] Add a return opcode to the formatter bytecode
 (#121602)

In LLVM we love our early exits and this opcode allows for simpler code
generation.
---
 lldb/docs/resources/formatterbytecode.rst              |  1 +
 lldb/examples/python/formatter_bytecode.py             |  4 ++++
 lldb/source/DataFormatters/FormatterBytecode.cpp       |  3 +++
 lldb/source/DataFormatters/FormatterBytecode.def       |  1 +
 lldb/unittests/DataFormatter/FormatterBytecodeTest.cpp | 10 ++++++++++
 5 files changed, 19 insertions(+)

diff --git a/lldb/docs/resources/formatterbytecode.rst b/lldb/docs/resources/formatterbytecode.rst
index 20e148363ef95..34fb0f7ee924c 100644
--- a/lldb/docs/resources/formatterbytecode.rst
+++ b/lldb/docs/resources/formatterbytecode.rst
@@ -75,6 +75,7 @@ These manipulate the control stack and program counter. Both `if` and `ifelse` e
  0x12      `ifelse`    `(UInt -> )` pop two blocks from the control stack, if
                        the top of the data stack is nonzero, execute the first,
                        otherwise the second.
+ 0x13      `return`    pop the entire control stack and return
 ========  ==========  ============================================================
 
 Literals for basic types
diff --git a/lldb/examples/python/formatter_bytecode.py b/lldb/examples/python/formatter_bytecode.py
index ccd0c68a75483..36a14be283f31 100644
--- a/lldb/examples/python/formatter_bytecode.py
+++ b/lldb/examples/python/formatter_bytecode.py
@@ -35,6 +35,7 @@ def define_opcode(n, mnemonic, name):
 define_opcode(0x10, "{", "begin")
 define_opcode(0x11, "if", "if")
 define_opcode(0x12, "ifelse", "ifelse")
+define_opcode(0x13, "return", "return")
 
 define_opcode(0x20, None, "lit_uint")
 define_opcode(0x21, None, "lit_int")
@@ -342,6 +343,9 @@ def next_byte():
             else:
                 frame.append(control.pop())
                 control.pop()
+        elif b == op_return:
+            control.clear()
+            return data[-1]
 
         # Literals.
         elif b == op_lit_uint:
diff --git a/lldb/source/DataFormatters/FormatterBytecode.cpp b/lldb/source/DataFormatters/FormatterBytecode.cpp
index e49c750678187..7f3dbe0dba37d 100644
--- a/lldb/source/DataFormatters/FormatterBytecode.cpp
+++ b/lldb/source/DataFormatters/FormatterBytecode.cpp
@@ -304,6 +304,9 @@ llvm::Error Interpret(std::vector &control,
       control.pop_back();
       activate_block();
       continue;
+    case op_return:
+      control.clear();
+      return pc.takeError();
 
     // Literals.
     case op_lit_uint:
diff --git a/lldb/source/DataFormatters/FormatterBytecode.def b/lldb/source/DataFormatters/FormatterBytecode.def
index c6645631fa006..29e0bee541c73 100644
--- a/lldb/source/DataFormatters/FormatterBytecode.def
+++ b/lldb/source/DataFormatters/FormatterBytecode.def
@@ -27,6 +27,7 @@ DEFINE_OPCODE(0x06, "rot",  rot)
 DEFINE_OPCODE(0x10, "{", begin)
 DEFINE_OPCODE(0x11, "if", if)
 DEFINE_OPCODE(0x12, "ifelse", ifelse)
+DEFINE_OPCODE(0x13, "return", return)
 
 DEFINE_OPCODE(0x20, nullptr, lit_uint)
 DEFINE_OPCODE(0x21, nullptr, lit_int)
diff --git a/lldb/unittests/DataFormatter/FormatterBytecodeTest.cpp b/lldb/unittests/DataFormatter/FormatterBytecodeTest.cpp
index 7307db650c162..5e980c3e1913c 100644
--- a/lldb/unittests/DataFormatter/FormatterBytecodeTest.cpp
+++ b/lldb/unittests/DataFormatter/FormatterBytecodeTest.cpp
@@ -97,6 +97,16 @@ TEST_F(FormatterBytecodeTest, ControlOps) {
                           data));
     ASSERT_EQ(data.Pop(), 42u);
   }
+  {
+    DataStack data;
+    ASSERT_TRUE(Interpret({op_lit_uint, 1, op_begin, 3, op_lit_uint, 42,
+                           op_return, op_if, op_lit_uint, 23},
+                          data));
+    ASSERT_EQ(data.Pop(), 42u);
+  }
+}
+
+TEST_F(FormatterBytecodeTest, ConversionOps) {
   {
     DataStack data(lldb::ValueObjectSP{});
     ASSERT_TRUE(Interpret({op_is_null}, data));

From b7637a855722b608ce2fb5aa860149db9b881197 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= 
Date: Fri, 3 Jan 2025 15:27:41 -0800
Subject: [PATCH 135/480] [flang][cuda] Set PINNED variable to false in
 ALLOCATE (#121593)

When `PINNED=` is used with variables that don't have the `PINNED`
attribute, the logical value must be set to false when host allocation
is performed.
---
 flang/lib/Lower/Allocatable.cpp            | 33 +++++++++++++++++-----
 flang/test/Lower/CUDA/cuda-allocatable.cuf | 27 ++++++++++++++++++
 2 files changed, 53 insertions(+), 7 deletions(-)

diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 5c63c79892f42..dc135543fafc7 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -454,6 +454,19 @@ class AllocateStmtHelper {
                                                    alloc.getSymbol());
   }
 
+  void setPinnedToFalse() {
+    if (!pinnedExpr)
+      return;
+    Fortran::lower::StatementContext stmtCtx;
+    mlir::Value pinned =
+        fir::getBase(converter.genExprAddr(loc, *pinnedExpr, stmtCtx));
+    mlir::Location loc = pinned.getLoc();
+    mlir::Value falseValue = builder.createBool(loc, false);
+    mlir::Value falseConv = builder.createConvert(
+        loc, fir::unwrapRefType(pinned.getType()), falseValue);
+    builder.create(loc, falseConv, pinned);
+  }
+
   void genSimpleAllocation(const Allocation &alloc,
                            const fir::MutableBoxValue &box) {
     bool isCudaSymbol = Fortran::semantics::HasCUDAAttr(alloc.getSymbol());
@@ -469,6 +482,7 @@ class AllocateStmtHelper {
       // can be validated.
       genInlinedAllocation(alloc, box);
       postAllocationAction(alloc);
+      setPinnedToFalse();
       return;
     }
 
@@ -482,11 +496,13 @@ class AllocateStmtHelper {
     genSetDeferredLengthParameters(alloc, box);
     genAllocateObjectBounds(alloc, box);
     mlir::Value stat;
-    if (!isCudaSymbol)
+    if (!isCudaSymbol) {
       stat = genRuntimeAllocate(builder, loc, box, errorManager);
-    else
+      setPinnedToFalse();
+    } else {
       stat =
           genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
+    }
     fir::factory::syncMutableBoxFromIRBox(builder, loc, box);
     postAllocationAction(alloc);
     errorManager.assignStat(builder, loc, stat);
@@ -616,13 +632,16 @@ class AllocateStmtHelper {
       genSetDeferredLengthParameters(alloc, box);
     genAllocateObjectBounds(alloc, box);
     mlir::Value stat;
-    if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
+    if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) {
       stat =
           genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
-    else if (isSource)
-      stat = genRuntimeAllocateSource(builder, loc, box, exv, errorManager);
-    else
-      stat = genRuntimeAllocate(builder, loc, box, errorManager);
+    } else {
+      if (isSource)
+        stat = genRuntimeAllocateSource(builder, loc, box, exv, errorManager);
+      else
+        stat = genRuntimeAllocate(builder, loc, box, errorManager);
+      setPinnedToFalse();
+    }
     fir::factory::syncMutableBoxFromIRBox(builder, loc, box);
     postAllocationAction(alloc);
     errorManager.assignStat(builder, loc, stat);
diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf
index 6479425c58d8b..8b287f859aa76 100644
--- a/flang/test/Lower/CUDA/cuda-allocatable.cuf
+++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf
@@ -196,3 +196,30 @@ end subroutine
 ! CHECK: %[[BOX:.*]] = fir.load %[[A]]#1 : !fir.ref>>>
 ! CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box>>) -> !fir.heap>
 ! CHECK: fir.freemem %[[BOXADDR]] : !fir.heap>
+
+subroutine setpinned()
+  integer, allocatable :: i(:)
+  logical :: plog
+  allocate(i(10), pinned=plog)
+end
+
+! CHECK-LABEL: func.func @_QPsetpinned()  
+! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsetpinnedEplog"}
+! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %[[PLOG]] {uniq_name = "_QFsetpinnedEplog"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+! CHECK: %[[FALSE:.*]] = arith.constant false
+! CHECK: %[[FLASE_CONV:.*]] = fir.convert %[[FALSE]] : (i1) -> !fir.logical<4>
+! CHECK: fir.store %[[FLASE_CONV]] to %[[PLOG_DECL]]#1 : !fir.ref>
+
+subroutine setpinnedpointer()
+  integer, pointer :: i(:)
+  logical :: plog
+  allocate(i(10), pinned=plog)
+end
+
+! CHECK-LABEL: func.func @_QPsetpinnedpointer()
+! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsetpinnedpointerEplog"}
+! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %[[PLOG]] {uniq_name = "_QFsetpinnedpointerEplog"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+! CHECK: fir.call @_FortranAPointerAllocate
+! CHECK: %[[FALSE:.*]] = arith.constant false
+! CHECK: %[[FLASE_CONV:.*]] = fir.convert %[[FALSE]] : (i1) -> !fir.logical<4>
+! CHECK: fir.store %[[FLASE_CONV]] to %[[PLOG_DECL]]#1 : !fir.ref>

From 54246a39e4cb06cec7d4bafb014e3cad73b1e4df Mon Sep 17 00:00:00 2001
From: Craig Topper 
Date: Fri, 3 Jan 2025 16:47:08 -0800
Subject: [PATCH 136/480] [RISCV] Pass VSETVLIInfo by const reference. NFC

---
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 75985832594d4..1fd130d7e040e 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -627,7 +627,7 @@ class VSETVLIInfo {
     return MI;
   }
 
-  void setAVL(VSETVLIInfo Info) {
+  void setAVL(const VSETVLIInfo &Info) {
     assert(Info.isValid());
     if (Info.isUnknown())
       setUnknown();
@@ -1223,7 +1223,8 @@ bool RISCVInsertVSETVLI::needVSETVLI(const DemandedFields &Used,
 // If we don't use LMUL or the SEW/LMUL ratio, then adjust LMUL so that we
 // maintain the SEW/LMUL ratio. This allows us to eliminate VL toggles in more
 // places.
-static VSETVLIInfo adjustIncoming(VSETVLIInfo PrevInfo, VSETVLIInfo NewInfo,
+static VSETVLIInfo adjustIncoming(const VSETVLIInfo &PrevInfo,
+                                  const VSETVLIInfo &NewInfo,
                                   DemandedFields &Demanded) {
   VSETVLIInfo Info = NewInfo;
 

From 82c0f68c041229eb48a7d018f7aa81d576d456a9 Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" 
Date: Fri, 3 Jan 2025 22:03:15 -0300
Subject: [PATCH 137/480] [libc] Remove assert to fix rv32 buildbot

---
 libc/src/unistd/linux/dup2.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libc/src/unistd/linux/dup2.cpp b/libc/src/unistd/linux/dup2.cpp
index c7c7c1a8ca786..7ffc151a053c9 100644
--- a/libc/src/unistd/linux/dup2.cpp
+++ b/libc/src/unistd/linux/dup2.cpp
@@ -32,7 +32,6 @@ LLVM_LIBC_FUNCTION(int, dup2, (int oldfd, int newfd)) {
     int ret = LIBC_NAMESPACE::syscall_impl(SYS_fcntl, oldfd, F_GETFD);
 #elif defined(SYS_fcntl64)
     // Same as fcntl but can handle large offsets
-    static_assert(sizeof(off_t) == 8);
     int ret = LIBC_NAMESPACE::syscall_impl(SYS_fcntl64, oldfd, F_GETFD);
 #else
 #error "SYS_fcntl and SYS_fcntl64 syscalls not available."

From e3dafa88a8f651825ac65aad9b273983598279dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= 
Date: Fri, 3 Jan 2025 17:35:41 -0800
Subject: [PATCH 138/480] [flang][cuda] Allow GOTO, EXIT, CYCLE and SELECT CASE
 in device procedures (#121612)

---
 flang/lib/Semantics/check-cuda.cpp | 23 +++++++++++++
 flang/test/Semantics/cuf09.cuf     | 53 ++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)

diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
index d497ac20e7017..d8a5639227648 100644
--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -302,6 +302,14 @@ template  class DeviceContextChecker {
             [&](const common::Indirection &x) {
               Check(x.value());
             },
+            [&](const common::Indirection &x) {
+              const auto &caseList{
+                  std::get>(
+                      x.value().t)};
+              for (const parser::CaseConstruct::Case &c : caseList) {
+                Check(std::get(c.t));
+              }
+            },
             [&](const auto &x) {
               if (auto source{parser::GetSource(x)}) {
                 context_.Say(*source,
@@ -347,9 +355,24 @@ template  class DeviceContextChecker {
           hostArray->name());
     }
   }
+  void ErrorInCUFKernel(parser::CharBlock source) {
+    if (IsCUFKernelDo) {
+      context_.Say(
+          source, "Statement may not appear in cuf kernel code"_err_en_US);
+    }
+  }
   void Check(const parser::ActionStmt &stmt, const parser::CharBlock &source) {
     common::visit(
         common::visitors{
+            [&](const common::Indirection &) {
+              ErrorInCUFKernel(source);
+            },
+            [&](const common::Indirection &) {
+              ErrorInCUFKernel(source);
+            },
+            [&](const common::Indirection &) {
+              ErrorInCUFKernel(source);
+            },
             [&](const common::Indirection &) { return; },
             [&](const common::Indirection &) {},
             [&](const common::Indirection &x) {
diff --git a/flang/test/Semantics/cuf09.cuf b/flang/test/Semantics/cuf09.cuf
index 3307e2a862672..06c9070fcbcd0 100644
--- a/flang/test/Semantics/cuf09.cuf
+++ b/flang/test/Semantics/cuf09.cuf
@@ -54,6 +54,59 @@ module m
     print*,threadIdx%x
     stop ! ok
   end subroutine
+
+  attributes(global) subroutine cycletest()
+    integer :: i
+    do i = 1, 10
+      cycle ! ok
+    end do
+  end subroutine
+
+  attributes(global) subroutine gototest()
+    integer :: i
+    goto 10
+    10 print *, "X is negative!" 
+  end subroutine
+
+  attributes(global) subroutine exittest()
+    integer :: i
+    do i = 1, 10
+      if (i == 1) then
+        exit ! ok
+      end if
+    end do
+  end subroutine
+
+  attributes(global) subroutine selectcasetest()
+    integer :: i
+    select case(i)
+    case (1)
+      print*,'main'
+    case default
+      print*, 'default'
+    end select
+  end subroutine
+
+  subroutine host()
+    integer :: i
+    !$cuf kernel do
+    do i = 1, 10
+      !ERROR: Statement may not appear in cuf kernel code
+      cycle
+    end do
+
+    !$cuf kernel do
+    do i = 1, 10
+      if (i == 1) then
+        !ERROR: Statement may not appear in cuf kernel code
+        exit ! not ok in a cuf kernel
+      end if
+
+      !ERROR: Statement may not appear in cuf kernel code
+      goto 10
+      10 print *, "X is negative!"
+    end do
+  end subroutine
 end
 
 program main

From 7c86ab8a18897c434fdb1ee3cd5ff2a71e6aae5a Mon Sep 17 00:00:00 2001
From: Slava Zakharin 
Date: Fri, 3 Jan 2025 18:25:31 -0800
Subject: [PATCH 139/480] [flang] Fixed the missing dependency. (#121370)

My local build with the shared libraries is broken.
I suppose this was introduced by #120374.

`flang/include/flang/Evaluate/constant.h` ends up being included
by `MapInfoFinalization.cpp` via `flang/Lower/DirectivesCommon.h`.
The undefined references are related to `ConstantBase` classes.
---
 flang/lib/Optimizer/OpenMP/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
index 4f23b2b970fa4..026889cca238a 100644
--- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
@@ -22,6 +22,7 @@ add_flang_library(FlangOpenMPTransforms
   FIRDialectSupport
   FIRSupport
   FortranCommon
+  FortranEvaluate
   MLIRFuncDialect
   MLIROpenMPDialect
   HLFIRDialect

From a2b9058c392995660956e56c2ac8695a44dc2e4e Mon Sep 17 00:00:00 2001
From: Craig Topper 
Date: Fri, 3 Jan 2025 19:04:13 -0800
Subject: [PATCH 140/480] [RISCV] Reduce size of CSR lookup tables. NFC
 (#121606)

Instead of storing 3 different names in each row of the table, use a
separate row for each name and use a flag to indicate what type of name
it is. The AltName and DeprecatedName weren't used often enough to
justify storing them as a possibility for every register.

This reduces the .rodata size by 27k and reduces the number of dynamic
relocations since we now only need 1 lookup-by-name function. The
lookup-by-name functions each contained a ~400 entry table of const char*
pointing to constant strings. Each of those pointers requires a dynamic
relocation.

I also capitalized IsRV32Only in the C++ code to match coding
standards.
---
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 25 ++++++++-----
 .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h |  8 ++--
 .../RISCV/MCTargetDesc/RISCVInstPrinter.cpp   |  2 +
 llvm/lib/Target/RISCV/RISCVSystemOperands.td  | 37 +++++++------------
 4 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 4c1fd5aa41e2b..2205c67c2d21b 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -1915,6 +1915,8 @@ ParseStatus RISCVAsmParser::parseCSRSystemRegister(OperandVector &Operands) {
         // Accept an immediate representing a named Sys Reg if it satisfies the
         // the required features.
         for (auto &Reg : Range) {
+          if (Reg.IsAltName || Reg.IsDeprecatedName)
+            continue;
           if (Reg.haveRequiredFeatures(STI->getFeatureBits()))
             return RISCVOperand::createSysReg(Reg.Name, S, Imm);
         }
@@ -1952,22 +1954,27 @@ ParseStatus RISCVAsmParser::parseCSRSystemRegister(OperandVector &Operands) {
       return ParseStatus::Failure;
 
     const auto *SysReg = RISCVSysReg::lookupSysRegByName(Identifier);
-    if (!SysReg)
-      SysReg = RISCVSysReg::lookupSysRegByAltName(Identifier);
-    if (!SysReg)
-      if ((SysReg = RISCVSysReg::lookupSysRegByDeprecatedName(Identifier)))
-        Warning(S, "'" + Identifier + "' is a deprecated alias for '" +
-                       SysReg->Name + "'");
-
-    // Accept a named Sys Reg if the required features are present.
+
     if (SysReg) {
+      if (SysReg->IsDeprecatedName) {
+        // Lookup the undeprecated name.
+        auto Range = RISCVSysReg::lookupSysRegByEncoding(SysReg->Encoding);
+        for (auto &Reg : Range) {
+          if (Reg.IsAltName || Reg.IsDeprecatedName)
+            continue;
+          Warning(S, "'" + Identifier + "' is a deprecated alias for '" +
+                         Reg.Name + "'");
+        }
+      }
+
+      // Accept a named Sys Reg if the required features are present.
       const auto &FeatureBits = getSTI().getFeatureBits();
       if (!SysReg->haveRequiredFeatures(FeatureBits)) {
         const auto *Feature = llvm::find_if(RISCVFeatureKV, [&](auto Feature) {
           return SysReg->FeaturesRequired[Feature.Value];
         });
         auto ErrorMsg = std::string("system register '") + SysReg->Name + "' ";
-        if (SysReg->isRV32Only && FeatureBits[RISCV::Feature64Bit]) {
+        if (SysReg->IsRV32Only && FeatureBits[RISCV::Feature64Bit]) {
           ErrorMsg += "is RV32 only";
           if (Feature != std::end(RISCVFeatureKV))
             ErrorMsg += " and ";
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index 7fb5fc7a83130..1c1a8b8009d2c 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -454,8 +454,6 @@ int getLoadFPImm(APFloat FPImm);
 namespace RISCVSysReg {
 struct SysReg {
   const char Name[32];
-  const char AltName[32];
-  const char DeprecatedName[32];
   unsigned Encoding;
   // FIXME: add these additional fields when needed.
   // Privilege Access: Read, Write, Read-Only.
@@ -467,11 +465,13 @@ struct SysReg {
   // Register number without the privilege bits.
   // unsigned Number;
   FeatureBitset FeaturesRequired;
-  bool isRV32Only;
+  bool IsRV32Only;
+  bool IsAltName;
+  bool IsDeprecatedName;
 
   bool haveRequiredFeatures(const FeatureBitset &ActiveFeatures) const {
     // Not in 32-bit mode.
-    if (isRV32Only && ActiveFeatures[RISCV::Feature64Bit])
+    if (IsRV32Only && ActiveFeatures[RISCV::Feature64Bit])
       return false;
     // No required feature associated with the system register.
     if (FeaturesRequired.none())
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index d36c0d7238cdc..d5254719b3839 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -121,6 +121,8 @@ void RISCVInstPrinter::printCSRSystemRegister(const MCInst *MI, unsigned OpNo,
   unsigned Imm = MI->getOperand(OpNo).getImm();
   auto Range = RISCVSysReg::lookupSysRegByEncoding(Imm);
   for (auto &Reg : Range) {
+    if (Reg.IsAltName || Reg.IsDeprecatedName)
+      continue;
     if (Reg.haveRequiredFeatures(STI.getFeatureBits())) {
       markup(O, Markup::Register) << Reg.Name;
       return;
diff --git a/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
index 72275daa1b8d1..39853cf13a920 100644
--- a/llvm/lib/Target/RISCV/RISCVSystemOperands.td
+++ b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
@@ -19,12 +19,6 @@ include "llvm/TableGen/SearchableTable.td"
 
 class SysReg op> {
   string Name = name;
-  // A maximum of one alias is supported right now.
-  string AltName = name;
-  // A maximum of one deprecated name is supported right now.  Unlike the
-  // `AltName` alias, a `DeprecatedName` generates a diagnostic when the name is
-  // used to encourage software to migrate away from the name.
-  string DeprecatedName = "";
   bits<12> Encoding = op;
   // FIXME: add these additional fields when needed.
   // Privilege Access: Read and Write = 0, 1, 2; Read-Only = 3.
@@ -37,14 +31,16 @@ class SysReg op> {
   // bits<6> Number = op{5 - 0};
   code FeaturesRequired = [{ {} }];
   bit isRV32Only = 0;
+  bit isAltName = 0;
+  bit isDeprecatedName = 0;
 }
 
 def SysRegsList : GenericTable {
   let FilterClass = "SysReg";
   // FIXME: add "ReadWrite", "Mode", "Extra", "Number" fields when needed.
   let Fields = [
-    "Name", "AltName", "DeprecatedName", "Encoding", "FeaturesRequired",
-    "isRV32Only",
+    "Name", "Encoding", "FeaturesRequired",
+    "isRV32Only", "isAltName", "isDeprecatedName"
   ];
 
   let PrimaryKey = [ "Encoding" ];
@@ -57,16 +53,6 @@ def lookupSysRegByName : SearchIndex {
   let Key = [ "Name" ];
 }
 
-def lookupSysRegByAltName : SearchIndex {
-  let Table = SysRegsList;
-  let Key = [ "AltName" ];
-}
-
-def lookupSysRegByDeprecatedName : SearchIndex {
-  let Table = SysRegsList;
-  let Key = [ "DeprecatedName" ];
-}
-
 // The following CSR encodings match those given in Tables 2.2,
 // 2.3, 2.4, 2.5 and 2.6 in the RISC-V Instruction Set Manual
 // Volume II: Privileged Architecture.
@@ -123,15 +109,17 @@ def : SysReg<"senvcfg", 0x10A>;
 def : SysReg<"sscratch", 0x140>;
 def : SysReg<"sepc", 0x141>;
 def : SysReg<"scause", 0x142>;
-let DeprecatedName = "sbadaddr" in
 def : SysReg<"stval", 0x143>;
+let isDeprecatedName = 1 in
+def : SysReg<"sbadaddr", 0x143>;
 def : SysReg<"sip", 0x144>;
 
 //===----------------------------------------------------------------------===//
 // Supervisor Protection and Translation
 //===----------------------------------------------------------------------===//
-let DeprecatedName = "sptbr" in
 def : SysReg<"satp", 0x180>;
+let isDeprecatedName = 1 in
+def : SysReg<"sptbr", 0x180>;
 
 //===----------------------------------------------------------------------===//
 // Quality-of-Service(QoS) Identifiers (Ssqosid)
@@ -245,8 +233,9 @@ def : SysReg<"mstatush", 0x310>;
 def : SysReg<"mscratch", 0x340>;
 def : SysReg<"mepc", 0x341>;
 def : SysReg<"mcause", 0x342>;
-let DeprecatedName = "mbadaddr" in
 def : SysReg<"mtval", 0x343>;
+let isDeprecatedName = 1 in
+def : SysReg<"mbadaddr", 0x343>;
 def : SysReg<"mip", 0x344>;
 def : SysReg<"mtinst", 0x34A>;
 def : SysReg<"mtval2", 0x34B>;
@@ -298,8 +287,9 @@ foreach i = 3...31 in
 //===----------------------------------------------------------------------===//
 // Machine Counter Setup
 //===----------------------------------------------------------------------===//
-let AltName = "mucounteren" in // Privileged spec v1.9.1 Name
 def : SysReg<"mcountinhibit", 0x320>;
+let isAltName = 1 in
+def : SysReg<"mucounteren", 0x320>;
 
 // mhpmevent3-mhpmevent31 at 0x323-0x33F.
 foreach i = 3...31 in
@@ -336,8 +326,9 @@ def : SysReg<"dpc", 0x7B1>;
 
 // "dscratch" is an alternative name for "dscratch0" which appeared in earlier
 // drafts of the RISC-V debug spec
-let AltName = "dscratch" in
 def : SysReg<"dscratch0", 0x7B2>;
+let isAltName = 1 in
+def : SysReg<"dscratch", 0x7B2>;
 def : SysReg<"dscratch1", 0x7B3>;
 
 //===----------------------------------------------------------------------===//

From dc3cd2e95ee56cdb75f4d0d0742626f912b5c6f3 Mon Sep 17 00:00:00 2001
From: Chandler Carruth 
Date: Fri, 3 Jan 2025 19:23:42 -0800
Subject: [PATCH 141/480] Factor common code for quoting a builtin name
 (#120835)

This shows up in several places in order to match the quoting of other
uses of the same diagnostic. Handling it centrally simplifies the code
and reduces changes if the storage for builtin names changes.

This refactoring is extracted out of #120534 as requested in code
review.
---
 clang/include/clang/Basic/Builtins.h     |  3 +++
 clang/lib/AST/ByteCode/InterpBuiltin.cpp |  4 ++--
 clang/lib/AST/ExprConstant.cpp           | 15 +++++++--------
 clang/lib/Basic/Builtins.cpp             |  4 ++++
 4 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/clang/include/clang/Basic/Builtins.h b/clang/include/clang/Basic/Builtins.h
index e27d8ccce7366..63559d977ce6b 100644
--- a/clang/include/clang/Basic/Builtins.h
+++ b/clang/include/clang/Basic/Builtins.h
@@ -102,6 +102,9 @@ class Context {
   /// e.g. "__builtin_abs".
   llvm::StringRef getName(unsigned ID) const { return getRecord(ID).Name; }
 
+  /// Return a quoted name for the specified builtin for use in diagnostics.
+  std::string getQuotedName(unsigned ID) const;
+
   /// Get the type descriptor string for the specified builtin.
   const char *getTypeString(unsigned ID) const { return getRecord(ID).Type; }
 
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 731c9290993f1..0d52083b06946 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -155,7 +155,7 @@ static void diagnoseNonConstexprBuiltin(InterpState &S, CodePtr OpPC,
   if (S.getLangOpts().CPlusPlus11)
     S.CCEDiag(Loc, diag::note_constexpr_invalid_function)
         << /*isConstexpr=*/0 << /*isConstructor=*/0
-        << ("'" + S.getASTContext().BuiltinInfo.getName(ID) + "'").str();
+        << S.getASTContext().BuiltinInfo.getQuotedName(ID);
   else
     S.CCEDiag(Loc, diag::note_invalid_subexpr_in_const_expr);
 }
@@ -1977,7 +1977,7 @@ static bool interp__builtin_memcmp(InterpState &S, CodePtr OpPC,
                   !isOneByteCharacterType(PtrB.getType()))) {
     S.FFDiag(S.Current->getSource(OpPC),
              diag::note_constexpr_memcmp_unsupported)
-        << ("'" + ASTCtx.BuiltinInfo.getName(ID) + "'").str() << PtrA.getType()
+        << ASTCtx.BuiltinInfo.getQuotedName(ID) << PtrA.getType()
         << PtrB.getType();
     return false;
   }
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index dd75dca647540..e220f69b3a4f5 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -9858,7 +9858,7 @@ bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     if (Info.getLangOpts().CPlusPlus11)
       Info.CCEDiag(E, diag::note_constexpr_invalid_function)
           << /*isConstexpr*/ 0 << /*isConstructor*/ 0
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str();
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp);
     else
       Info.CCEDiag(E, diag::note_invalid_subexpr_in_const_expr);
     [[fallthrough]];
@@ -9903,8 +9903,7 @@ bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     // FIXME: We can compare the bytes in the correct order.
     if (IsRawByte && !isOneByteCharacterType(CharTy)) {
       Info.FFDiag(E, diag::note_constexpr_memchr_unsupported)
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str()
-          << CharTy;
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp) << CharTy;
       return false;
     }
     // Figure out what value we're actually looking for (after converting to
@@ -9966,7 +9965,7 @@ bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     if (Info.getLangOpts().CPlusPlus11)
       Info.CCEDiag(E, diag::note_constexpr_invalid_function)
           << /*isConstexpr*/ 0 << /*isConstructor*/ 0
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str();
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp);
     else
       Info.CCEDiag(E, diag::note_invalid_subexpr_in_const_expr);
     [[fallthrough]];
@@ -13241,7 +13240,7 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     if (Info.getLangOpts().CPlusPlus11)
       Info.CCEDiag(E, diag::note_constexpr_invalid_function)
           << /*isConstexpr*/ 0 << /*isConstructor*/ 0
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str();
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp);
     else
       Info.CCEDiag(E, diag::note_invalid_subexpr_in_const_expr);
     [[fallthrough]];
@@ -13266,7 +13265,7 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     if (Info.getLangOpts().CPlusPlus11)
       Info.CCEDiag(E, diag::note_constexpr_invalid_function)
           << /*isConstexpr*/ 0 << /*isConstructor*/ 0
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str();
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp);
     else
       Info.CCEDiag(E, diag::note_invalid_subexpr_in_const_expr);
     [[fallthrough]];
@@ -13321,8 +13320,8 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
         !(isOneByteCharacterType(CharTy1) && isOneByteCharacterType(CharTy2))) {
       // FIXME: Consider using our bit_cast implementation to support this.
       Info.FFDiag(E, diag::note_constexpr_memcmp_unsupported)
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str()
-          << CharTy1 << CharTy2;
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp) << CharTy1
+          << CharTy2;
       return false;
     }
 
diff --git a/clang/lib/Basic/Builtins.cpp b/clang/lib/Basic/Builtins.cpp
index 8dd1888db2988..588183788de32 100644
--- a/clang/lib/Basic/Builtins.cpp
+++ b/clang/lib/Basic/Builtins.cpp
@@ -163,6 +163,10 @@ void Builtin::Context::initializeBuiltins(IdentifierTable &Table,
   }
 }
 
+std::string Builtin::Context::getQuotedName(unsigned ID) const {
+  return (llvm::Twine("'") + getName(ID) + "'").str();
+}
+
 unsigned Builtin::Context::getRequiredVectorWidth(unsigned ID) const {
   const char *WidthPos = ::strchr(getRecord(ID).Attributes, 'V');
   if (!WidthPos)

From 34f0611bc36db40789823030a3748a8595198719 Mon Sep 17 00:00:00 2001
From: Owen Pan 
Date: Fri, 3 Jan 2025 20:09:39 -0800
Subject: [PATCH 142/480] [clang-format][doc] Minor cleanup

---
 clang/docs/ClangFormatStyleOptions.rst | 9 +++++----
 clang/include/clang/Format/Format.h    | 9 +++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index 7bfaee4e2d35b..637ec23e0abaf 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -4661,12 +4661,13 @@ the configuration (without a prefix: ``Auto``).
 .. _KeepEmptyLinesAtEOF:
 
 **KeepEmptyLinesAtEOF** (``Boolean``) :versionbadge:`clang-format 17` :ref:`¶ `
-  This option is deprecated. See ``AtEndOfFile`` of ``KeepEmptyLines``.
+  This option is **deprecated**. See ``AtEndOfFile`` of ``KeepEmptyLines``.
 
 .. _KeepEmptyLinesAtTheStartOfBlocks:
 
 **KeepEmptyLinesAtTheStartOfBlocks** (``Boolean``) :versionbadge:`clang-format 3.7` :ref:`¶ `
-  This option is deprecated. See ``AtStartOfBlock`` of ``KeepEmptyLines``.
+  This option is **deprecated**. See ``AtStartOfBlock`` of
+  ``KeepEmptyLines``.
 
 .. _KeepFormFeed:
 
@@ -6730,8 +6731,8 @@ the configuration (without a prefix: ``Auto``).
 .. _TemplateNames:
 
 **TemplateNames** (``List of Strings``) :versionbadge:`clang-format 20` :ref:`¶ `
-  A vector of non-keyword identifiers that should be interpreted as
-  template names.
+  A vector of non-keyword identifiers that should be interpreted as template
+  names.
 
   A ``<`` after a template name is annotated as a template opener instead of
   a binary operator.
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index 9b7a633e0a146..8d41077549690 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -3203,11 +3203,12 @@ struct FormatStyle {
   /// \version 19
   KeepEmptyLinesStyle KeepEmptyLines;
 
-  /// This option is deprecated. See ``AtEndOfFile`` of ``KeepEmptyLines``.
+  /// This option is **deprecated**. See ``AtEndOfFile`` of ``KeepEmptyLines``.
   /// \version 17
   // bool KeepEmptyLinesAtEOF;
 
-  /// This option is deprecated. See ``AtStartOfBlock`` of ``KeepEmptyLines``.
+  /// This option is **deprecated**. See ``AtStartOfBlock`` of
+  /// ``KeepEmptyLines``.
   /// \version 3.7
   // bool KeepEmptyLinesAtTheStartOfBlocks;
 
@@ -5042,8 +5043,8 @@ struct FormatStyle {
   /// \version 3.7
   unsigned TabWidth;
 
-  /// A vector of non-keyword identifiers that should be interpreted as
-  /// template names.
+  /// A vector of non-keyword identifiers that should be interpreted as template
+  /// names.
   ///
   /// A ``<`` after a template name is annotated as a template opener instead of
   /// a binary operator.

From aa0f3343a60c6132d9f6adfb8f62234a95519918 Mon Sep 17 00:00:00 2001
From: Craig Topper 
Date: Fri, 3 Jan 2025 19:51:28 -0800
Subject: [PATCH 143/480] [TableGen] Add 'final' to all of the *Init classes.

Classes that used TrailingObjects were already 'final'. Add to the
rest for consistency.
---
 llvm/include/llvm/TableGen/Record.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h
index 81a9257425783..3402f1957a1c1 100644
--- a/llvm/include/llvm/TableGen/Record.h
+++ b/llvm/include/llvm/TableGen/Record.h
@@ -448,7 +448,7 @@ class TypedInit : public Init {
 };
 
 /// '?' - Represents an uninitialized value.
-class UnsetInit : public Init {
+class UnsetInit final : public Init {
   friend detail::RecordKeeperImpl;
 
   /// The record keeper that initialized this Init.
@@ -486,7 +486,7 @@ class UnsetInit : public Init {
 
 // Represent an argument.
 using ArgAuxType = std::variant;
-class ArgumentInit : public Init, public FoldingSetNode {
+class ArgumentInit final : public Init, public FoldingSetNode {
 public:
   enum Kind {
     Positional,
@@ -638,7 +638,7 @@ class BitsInit final : public TypedInit,
 };
 
 /// '7' - Represent an initialization by a literal integer value.
-class IntInit : public TypedInit {
+class IntInit final : public TypedInit {
   int64_t Value;
 
   explicit IntInit(RecordKeeper &RK, int64_t V)
@@ -669,7 +669,7 @@ class IntInit : public TypedInit {
 };
 
 /// "anonymous_n" - Represent an anonymous record name
-class AnonymousNameInit : public TypedInit {
+class AnonymousNameInit final : public TypedInit {
   unsigned Value;
 
   explicit AnonymousNameInit(RecordKeeper &RK, unsigned V)
@@ -699,7 +699,7 @@ class AnonymousNameInit : public TypedInit {
 };
 
 /// "foo" - Represent an initialization by a string value.
-class StringInit : public TypedInit {
+class StringInit final : public TypedInit {
 public:
   enum StringFormat {
     SF_String, // Format as "text"
@@ -845,7 +845,7 @@ class OpInit : public TypedInit {
 
 /// !op (X) - Transform an init.
 ///
-class UnOpInit : public OpInit, public FoldingSetNode {
+class UnOpInit final : public OpInit, public FoldingSetNode {
 public:
   enum UnaryOp : uint8_t {
     TOLOWER,
@@ -908,7 +908,7 @@ class UnOpInit : public OpInit, public FoldingSetNode {
 };
 
 /// !op (X, Y) - Combine two inits.
-class BinOpInit : public OpInit, public FoldingSetNode {
+class BinOpInit final : public OpInit, public FoldingSetNode {
 public:
   enum BinaryOp : uint8_t {
     ADD,
@@ -995,7 +995,7 @@ class BinOpInit : public OpInit, public FoldingSetNode {
 };
 
 /// !op (X, Y, Z) - Combine two inits.
-class TernOpInit : public OpInit, public FoldingSetNode {
+class TernOpInit final : public OpInit, public FoldingSetNode {
 public:
   enum TernaryOp : uint8_t {
     SUBST,
@@ -1144,7 +1144,7 @@ class CondOpInit final : public TypedInit,
 };
 
 /// !foldl (a, b, expr, start, lst) - Fold over a list.
-class FoldOpInit : public TypedInit, public FoldingSetNode {
+class FoldOpInit final : public TypedInit, public FoldingSetNode {
 private:
   const Init *Start, *List, *A, *B, *Expr;
 
@@ -1179,7 +1179,7 @@ class FoldOpInit : public TypedInit, public FoldingSetNode {
 };
 
 /// !isa(expr) - Dynamically determine the type of an expression.
-class IsAOpInit : public TypedInit, public FoldingSetNode {
+class IsAOpInit final : public TypedInit, public FoldingSetNode {
 private:
   const RecTy *CheckType;
   const Init *Expr;
@@ -1213,7 +1213,7 @@ class IsAOpInit : public TypedInit, public FoldingSetNode {
 
 /// !exists(expr) - Dynamically determine if a record of `type` named
 /// `expr` exists.
-class ExistsOpInit : public TypedInit, public FoldingSetNode {
+class ExistsOpInit final : public TypedInit, public FoldingSetNode {
 private:
   const RecTy *CheckType;
   const Init *Expr;
@@ -1246,7 +1246,7 @@ class ExistsOpInit : public TypedInit, public FoldingSetNode {
 };
 
 /// 'Opcode' - Represent a reference to an entire variable object.
-class VarInit : public TypedInit {
+class VarInit final : public TypedInit {
   const Init *VarName;
 
   explicit VarInit(const Init *VN, const RecTy *T)
@@ -1320,7 +1320,7 @@ class VarBitInit final : public TypedInit {
 };
 
 /// AL - Represent a reference to a 'def' in the description
-class DefInit : public TypedInit {
+class DefInit final : public TypedInit {
   friend class Record;
 
   const Record *Def;
@@ -1409,7 +1409,7 @@ class VarDefInit final
 };
 
 /// X.Y - Represent a reference to a subfield of a variable
-class FieldInit : public TypedInit {
+class FieldInit final : public TypedInit {
   const Init *Rec;             // Record we are referring to
   const StringInit *FieldName; // Field we are accessing
 

From 2d424765f496410d6ab95a80c90d2eda933d66d4 Mon Sep 17 00:00:00 2001
From: Matthias Springer 
Date: Sat, 4 Jan 2025 09:12:03 +0100
Subject: [PATCH 144/480] [mlir][IR][NFC] `DominanceInfo`: Share same impl for
 block/op dominance (#115587)

The `properlyDominates` implementations for blocks and ops are very
similar. This commit replaces them with a single implementation that
operates on block iterators. That implementation can be used to
implement both `properlyDominates` variants.

Before:
```c++
template <bool IsPostDom>
bool DominanceInfoBase<IsPostDom>::properlyDominatesImpl(Block *a,
                                                         Block *b) const;
template <bool IsPostDom>
bool DominanceInfoBase<IsPostDom>::properlyDominatesImpl(
    Operation *a, Operation *b, bool enclosingOpOk) const;
```

After:
```c++
template <bool IsPostDom>
bool DominanceInfoBase<IsPostDom>::properlyDominatesImpl(
    Block *aBlock, Block::iterator aIt, Block *bBlock, Block::iterator bIt,
    bool enclosingOk) const;
```

Note: A subsequent commit will add a new public `properlyDominates`
overload that accepts block iterators. That functionality can then be
used to find a valid insertion point at which a range of values is
defined (by utilizing post dominance).
---
 mlir/include/mlir/IR/Dominance.h |  28 +++----
 mlir/lib/IR/Dominance.cpp        | 124 ++++++++++++++++++++-----------
 2 files changed, 92 insertions(+), 60 deletions(-)

diff --git a/mlir/include/mlir/IR/Dominance.h b/mlir/include/mlir/IR/Dominance.h
index 16d17b9c0f3d0..63504cad211a4 100644
--- a/mlir/include/mlir/IR/Dominance.h
+++ b/mlir/include/mlir/IR/Dominance.h
@@ -113,12 +113,12 @@ class DominanceInfoBase {
   llvm::PointerIntPair
   getDominanceInfo(Region *region, bool needsDomTree) const;
 
-  /// Return "true" if the specified block A properly (post)dominates block B.
-  bool properlyDominatesImpl(Block *a, Block *b) const;
-
-  /// Return "true" if the specified op A properly (post)dominates op B.
-  bool properlyDominatesImpl(Operation *a, Operation *b,
-                             bool enclosingOpOk = true) const;
+  /// Return "true" if block iterator A properly (post)dominates block iterator
+  /// B. If `enclosingOk` is set, A is considered to (post)dominate B if A
+  /// encloses B.
+  bool properlyDominatesImpl(Block *aBlock, Block::iterator aIt, Block *bBlock,
+                             Block::iterator bIt,
+                             bool enclosingOk = true) const;
 
   /// A mapping of regions to their base dominator tree and a cached
   /// "hasSSADominance" bit. This map does not contain dominator trees for
@@ -151,9 +151,7 @@ class DominanceInfo : public detail::DominanceInfoBase {
   /// The `enclosingOpOk` flag says whether we should return true if the B op
   /// is enclosed by a region on A.
   bool properlyDominates(Operation *a, Operation *b,
-                         bool enclosingOpOk = true) const {
-    return super::properlyDominatesImpl(a, b, enclosingOpOk);
-  }
+                         bool enclosingOpOk = true) const;
 
   /// Return true if operation A dominates operation B, i.e. if A and B are the
   /// same operation or A properly dominates B.
@@ -188,9 +186,7 @@ class DominanceInfo : public detail::DominanceInfoBase {
   /// Graph regions have only a single block. To be consistent with "proper
   /// dominance" of ops, the single block is considered to properly dominate
   /// itself in a graph region.
-  bool properlyDominates(Block *a, Block *b) const {
-    return super::properlyDominatesImpl(a, b);
-  }
+  bool properlyDominates(Block *a, Block *b) const;
 };
 
 /// A class for computing basic postdominance information.
@@ -200,9 +196,7 @@ class PostDominanceInfo : public detail::DominanceInfoBase {
 
   /// Return true if operation A properly postdominates operation B.
   bool properlyPostDominates(Operation *a, Operation *b,
-                             bool enclosingOpOk = true) const {
-    return super::properlyDominatesImpl(a, b, enclosingOpOk);
-  }
+                             bool enclosingOpOk = true) const;
 
   /// Return true if operation A postdominates operation B.
   bool postDominates(Operation *a, Operation *b) const {
@@ -210,9 +204,7 @@ class PostDominanceInfo : public detail::DominanceInfoBase {
   }
 
   /// Return true if the specified block A properly postdominates block B.
-  bool properlyPostDominates(Block *a, Block *b) const {
-    return super::properlyDominatesImpl(a, b);
-  }
+  bool properlyPostDominates(Block *a, Block *b) const;
 
   /// Return true if the specified block A postdominates block B.
   bool postDominates(Block *a, Block *b) const {
diff --git a/mlir/lib/IR/Dominance.cpp b/mlir/lib/IR/Dominance.cpp
index 406e0f2d62d64..1c54e09d29b9b 100644
--- a/mlir/lib/IR/Dominance.cpp
+++ b/mlir/lib/IR/Dominance.cpp
@@ -213,61 +213,73 @@ DominanceInfoBase::findNearestCommonDominator(Block *a,
   return getDomTree(a->getParent()).findNearestCommonDominator(a, b);
 }
 
-/// Return true if the specified block A properly dominates block B.
-template 
-bool DominanceInfoBase::properlyDominatesImpl(Block *a,
-                                                         Block *b) const {
-  assert(a && b && "null blocks not allowed");
+/// Returns the given block iterator if it lies within the given region.
+/// Otherwise, finds the ancestor of the given block iterator that lies
+/// within the given region. Returns an "empty" iterator if the latter
+/// fails.
+///
+/// Note: This is a variant of Region::findAncestorOpInRegion that operates on
+/// block iterators instead of ops.
+static std::pair
+findAncestorIteratorInRegion(Region *r, Block *b, Block::iterator it) {
+  // Case 1: The iterator lies within the given region.
+  if (b->getParent() == r)
+    return std::make_pair(b, it);
+
+  // Otherwise: Find ancestor iterator. Bail if we run out of parent ops.
+  Operation *parentOp = b->getParentOp();
+  if (!parentOp)
+    return std::make_pair(static_cast(nullptr), Block::iterator());
+  Operation *op = r->findAncestorOpInRegion(*parentOp);
+  if (!op)
+    return std::make_pair(static_cast(nullptr), Block::iterator());
+  return std::make_pair(op->getBlock(), op->getIterator());
+}
 
-  // A block dominates, but does not properly dominate, itself unless this
-  // is a graph region.
+/// Given two iterators into the same block, return "true" if `a` is before `b`.
+/// Note: This is a variant of Operation::isBeforeInBlock that operates on
+/// block iterators instead of ops.
+static bool isBeforeInBlock(Block *block, Block::iterator a,
+                            Block::iterator b) {
   if (a == b)
-    return !hasSSADominance(a);
-
-  // If both blocks are not in the same region, `a` properly dominates `b` if
-  // `b` is defined in an operation region that (recursively) ends up being
-  // dominated by `a`. Walk up the list of containers enclosing B.
-  Region *regionA = a->getParent();
-  if (regionA != b->getParent()) {
-    b = regionA ? regionA->findAncestorBlockInRegion(*b) : nullptr;
-    // If we could not find a valid block b then it is a not a dominator.
-    if (!b)
-      return false;
-
-    // Check to see if the ancestor of `b` is the same block as `a`.  A properly
-    // dominates B if it contains an op that contains the B block.
-    if (a == b)
-      return true;
-  }
-
-  // Otherwise, they are two different blocks in the same region, use DomTree.
-  return getDomTree(regionA).properlyDominates(a, b);
+    return false;
+  if (a == block->end())
+    return false;
+  if (b == block->end())
+    return true;
+  return a->isBeforeInBlock(&*b);
 }
 
 template 
 bool DominanceInfoBase::properlyDominatesImpl(
-    Operation *a, Operation *b, bool enclosingOpOk) const {
-  Block *aBlock = a->getBlock(), *bBlock = b->getBlock();
-  assert(aBlock && bBlock && "operations must be in a block");
+    Block *aBlock, Block::iterator aIt, Block *bBlock, Block::iterator bIt,
+    bool enclosingOk) const {
+  assert(aBlock && bBlock && "expected non-null blocks");
 
-  // An operation (pos)dominates, but does not properly (pos)dominate, itself
-  // unless this is a graph region.
-  if (a == b)
+  // A block iterator (post)dominates, but does not properly (post)dominate,
+  // itself unless this is a graph region.
+  if (aBlock == bBlock && aIt == bIt)
     return !hasSSADominance(aBlock);
 
-  // If these ops are in different regions, then normalize one into the other.
+  // If the iterators are in different regions, then normalize one into the
+  // other.
   Region *aRegion = aBlock->getParent();
   if (aRegion != bBlock->getParent()) {
-    // Scoot up b's region tree until we find an operation in A's region that
+    // Scoot up b's region tree until we find a location in A's region that
     // encloses it.  If this fails, then we know there is no (post)dom relation.
-    b = aRegion ? aRegion->findAncestorOpInRegion(*b) : nullptr;
-    if (!b)
+    if (!aRegion) {
+      bBlock = nullptr;
+      bIt = Block::iterator();
+    } else {
+      std::tie(bBlock, bIt) =
+          findAncestorIteratorInRegion(aRegion, bBlock, bIt);
+    }
+    if (!bBlock)
       return false;
-    bBlock = b->getBlock();
-    assert(bBlock->getParent() == aRegion);
+    assert(bBlock->getParent() == aRegion && "expected block in regionA");
 
     // If 'a' encloses 'b', then we consider it to (post)dominate.
-    if (a == b && enclosingOpOk)
+    if (aBlock == bBlock && aIt == bIt && enclosingOk)
       return true;
   }
 
@@ -279,9 +291,9 @@ bool DominanceInfoBase::properlyDominatesImpl(
     if (!hasSSADominance(aBlock))
       return true;
     if constexpr (IsPostDom) {
-      return b->isBeforeInBlock(a);
+      return isBeforeInBlock(aBlock, bIt, aIt);
     } else {
-      return a->isBeforeInBlock(b);
+      return isBeforeInBlock(aBlock, aIt, bIt);
     }
   }
 
@@ -309,6 +321,18 @@ template class detail::DominanceInfoBase;
 // DominanceInfo
 //===----------------------------------------------------------------------===//
 
+bool DominanceInfo::properlyDominates(Operation *a, Operation *b,
+                                      bool enclosingOpOk) const {
+  return super::properlyDominatesImpl(a->getBlock(), a->getIterator(),
+                                      b->getBlock(), b->getIterator(),
+                                      enclosingOpOk);
+}
+
+bool DominanceInfo::properlyDominates(Block *a, Block *b) const {
+  return super::properlyDominatesImpl(a, a->begin(), b, b->begin(),
+                                      /*enclosingOk=*/true);
+}
+
 /// Return true if the `a` value properly dominates operation `b`, i.e if the
 /// operation that defines `a` properlyDominates `b` and the operation that
 /// defines `a` does not contain `b`.
@@ -322,3 +346,19 @@ bool DominanceInfo::properlyDominates(Value a, Operation *b) const {
   // `b`, but `a` does not itself enclose `b` in one of its regions.
   return properlyDominates(a.getDefiningOp(), b, /*enclosingOpOk=*/false);
 }
+
+//===----------------------------------------------------------------------===//
+// PostDominanceInfo
+//===----------------------------------------------------------------------===//
+
+bool PostDominanceInfo::properlyPostDominates(Operation *a, Operation *b,
+                                              bool enclosingOpOk) const {
+  return super::properlyDominatesImpl(a->getBlock(), a->getIterator(),
+                                      b->getBlock(), b->getIterator(),
+                                      enclosingOpOk);
+}
+
+bool PostDominanceInfo::properlyPostDominates(Block *a, Block *b) const {
+  return super::properlyDominatesImpl(a, a->end(), b, b->end(),
+                                      /*enclosingOk=*/true);
+}

From 95c5c5d4badf7c2128d098be325356e15c2197be Mon Sep 17 00:00:00 2001
From: Matthias Springer 
Date: Sat, 4 Jan 2025 09:23:15 +0100
Subject: [PATCH 145/480] [mlir][Transforms][NFC] Use `DominanceInfo` to
 compute materialization insertion point (#120746)

In the dialect conversion driver, use `DominanceInfo` to compute a
suitable insertion point for N:1 source materializations.
---
 mlir/include/mlir/IR/Dominance.h              | 23 ++++++
 .../Transforms/Utils/DialectConversion.cpp    | 70 +++++--------------
 2 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/mlir/include/mlir/IR/Dominance.h b/mlir/include/mlir/IR/Dominance.h
index 63504cad211a4..9e1254c1dfe1e 100644
--- a/mlir/include/mlir/IR/Dominance.h
+++ b/mlir/include/mlir/IR/Dominance.h
@@ -187,6 +187,17 @@ class DominanceInfo : public detail::DominanceInfoBase {
   /// dominance" of ops, the single block is considered to properly dominate
   /// itself in a graph region.
   bool properlyDominates(Block *a, Block *b) const;
+
+  bool properlyDominates(Block *aBlock, Block::iterator aIt, Block *bBlock,
+                         Block::iterator bIt, bool enclosingOk = true) const {
+    return super::properlyDominatesImpl(aBlock, aIt, bBlock, bIt, enclosingOk);
+  }
+
+  bool dominates(Block *aBlock, Block::iterator aIt, Block *bBlock,
+                 Block::iterator bIt, bool enclosingOk = true) const {
+    return (aBlock == bBlock && aIt == bIt) ||
+           super::properlyDominatesImpl(aBlock, aIt, bBlock, bIt, enclosingOk);
+  }
 };
 
 /// A class for computing basic postdominance information.
@@ -210,6 +221,18 @@ class PostDominanceInfo : public detail::DominanceInfoBase {
   bool postDominates(Block *a, Block *b) const {
     return a == b || properlyPostDominates(a, b);
   }
+
+  bool properlyPostDominates(Block *aBlock, Block::iterator aIt, Block *bBlock,
+                             Block::iterator bIt,
+                             bool enclosingOk = true) const {
+    return super::properlyDominatesImpl(aBlock, aIt, bBlock, bIt, enclosingOk);
+  }
+
+  bool postDominates(Block *aBlock, Block::iterator aIt, Block *bBlock,
+                     Block::iterator bIt, bool enclosingOk = true) const {
+    return (aBlock == bBlock && aIt == bIt) ||
+           super::properlyDominatesImpl(aBlock, aIt, bBlock, bIt, enclosingOk);
+  }
 };
 
 } // namespace mlir
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 6c3863e4c7f66..1e689cd96ae71 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -54,55 +54,6 @@ static void logFailure(llvm::ScopedPrinter &os, StringRef fmt, Args &&...args) {
   });
 }
 
-/// Given two insertion points in the same block, choose the later one.
-static OpBuilder::InsertPoint
-chooseLaterInsertPointInBlock(OpBuilder::InsertPoint a,
-                              OpBuilder::InsertPoint b) {
-  assert(a.getBlock() == b.getBlock() && "expected same block");
-  Block *block = a.getBlock();
-  if (a.getPoint() == block->begin())
-    return b;
-  if (b.getPoint() == block->begin())
-    return a;
-  if (a.getPoint()->isBeforeInBlock(&*b.getPoint()))
-    return b;
-  return a;
-}
-
-/// Helper function that chooses the insertion point among the two given ones
-/// that is later.
-// TODO: Extend DominanceInfo API to work with block iterators.
-static OpBuilder::InsertPoint chooseLaterInsertPoint(OpBuilder::InsertPoint a,
-                                                     OpBuilder::InsertPoint b) {
-  // Case 1: Fast path: Same block. This is the most common case.
-  if (LLVM_LIKELY(a.getBlock() == b.getBlock()))
-    return chooseLaterInsertPointInBlock(a, b);
-
-  // Case 2: Different block, but same region.
-  if (a.getBlock()->getParent() == b.getBlock()->getParent()) {
-    DominanceInfo domInfo;
-    if (domInfo.properlyDominates(a.getBlock(), b.getBlock()))
-      return b;
-    if (domInfo.properlyDominates(b.getBlock(), a.getBlock()))
-      return a;
-    // Neither of the two blocks dominante each other.
-    llvm_unreachable("unable to find valid insertion point");
-  }
-
-  // Case 3: b's region contains a: choose a.
-  if (b.getBlock()->getParent()->findAncestorOpInRegion(
-          *a.getPoint()->getParentOp()))
-    return a;
-
-  // Case 4: a's region contains b: choose b.
-  if (a.getBlock()->getParent()->findAncestorOpInRegion(
-          *b.getPoint()->getParentOp()))
-    return b;
-
-  // Neither of the two operations contain each other.
-  llvm_unreachable("unable to find valid insertion point");
-}
-
 /// Helper function that computes an insertion point where the given value is
 /// defined and can be used without a dominance violation.
 static OpBuilder::InsertPoint computeInsertPoint(Value value) {
@@ -117,9 +68,26 @@ static OpBuilder::InsertPoint computeInsertPoint(Value value) {
 /// defined and can be used without a dominance violation.
 static OpBuilder::InsertPoint computeInsertPoint(ArrayRef vals) {
   assert(!vals.empty() && "expected at least one value");
+  DominanceInfo domInfo;
   OpBuilder::InsertPoint pt = computeInsertPoint(vals.front());
-  for (Value v : vals.drop_front())
-    pt = chooseLaterInsertPoint(pt, computeInsertPoint(v));
+  for (Value v : vals.drop_front()) {
+    // Choose the "later" insertion point.
+    OpBuilder::InsertPoint nextPt = computeInsertPoint(v);
+    if (domInfo.dominates(pt.getBlock(), pt.getPoint(), nextPt.getBlock(),
+                          nextPt.getPoint())) {
+      // pt is before nextPt => choose nextPt.
+      pt = nextPt;
+    } else {
+#ifndef NDEBUG
+      // nextPt should be before pt => choose pt.
+      // If pt and nextPt have no dominance relationship, then there is no
+      // valid insertion point at which all given values are defined.
+      bool dom = domInfo.dominates(nextPt.getBlock(), nextPt.getPoint(),
+                                   pt.getBlock(), pt.getPoint());
+      assert(dom && "unable to find valid insertion point");
+#endif // NDEBUG
+    }
+  }
   return pt;
 }
 

From fac46469977da9c4e9c6eeaac21103c971190577 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng 
Date: Sat, 4 Jan 2025 17:23:57 +0800
Subject: [PATCH 146/480] [InstCombine] Check no wrap flags before folding icmp
 of GEPs with same indices (#121628)

Alive2: https://alive2.llvm.org/ce/z/Dr3Sbe
Closes https://github.com/llvm/llvm-project/issues/121581.
---
 .../InstCombine/InstCombineCompares.cpp       |  6 ++-
 llvm/test/Transforms/InstCombine/icmp-gep.ll  | 48 +++++++++++++++++++
 2 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index d6fdade25559f..8b23583c51063 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -747,6 +747,8 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
                         ConstantExpr::getPointerBitCastOrAddrSpaceCast(
                             cast(RHS), Base->getType()));
   } else if (GEPOperator *GEPRHS = dyn_cast(RHS)) {
+    GEPNoWrapFlags NW = GEPLHS->getNoWrapFlags() & GEPRHS->getNoWrapFlags();
+
     // If the base pointers are different, but the indices are the same, just
     // compare the base pointer.
     if (PtrBase != GEPRHS->getOperand(0)) {
@@ -764,7 +766,8 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
 
       // If all indices are the same, just compare the base pointers.
       Type *BaseType = GEPLHS->getOperand(0)->getType();
-      if (IndicesTheSame && CmpInst::makeCmpResultType(BaseType) == I.getType())
+      if (IndicesTheSame &&
+          CmpInst::makeCmpResultType(BaseType) == I.getType() && CanFold(NW))
         return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0));
 
       // If we're comparing GEPs with two base pointers that only differ in type
@@ -804,7 +807,6 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
       return transformToIndexedCompare(GEPLHS, RHS, Cond, DL, *this);
     }
 
-    GEPNoWrapFlags NW = GEPLHS->getNoWrapFlags() & GEPRHS->getNoWrapFlags();
     if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands() &&
         GEPLHS->getSourceElementType() == GEPRHS->getSourceElementType()) {
       // If the GEPs only differ by one index, compare it.
diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll
index f9b90c224d832..7f8f1ae73948d 100644
--- a/llvm/test/Transforms/InstCombine/icmp-gep.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll
@@ -709,3 +709,51 @@ define i1 @pointer_icmp_aligned_with_offset_negative(ptr align 8 %a, ptr align 8
   %cmp = icmp eq ptr %gep, %a2
   ret i1 %cmp
 }
+
+define i1 @gep_diff_base_same_indices(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices(
+; CHECK-NEXT:    [[X:%.*]] = getelementptr i8, ptr [[X1:%.*]], i64 [[Z:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = getelementptr i8, ptr [[Y1:%.*]], i64 [[Z]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr i8, ptr %x, i64 %z
+  %gep2 = getelementptr i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @gep_diff_base_same_indices_nuw(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices_nuw(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nuw i8, ptr %x, i64 %z
+  %gep2 = getelementptr nuw i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @gep_diff_base_same_indices_nusw(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices_nusw(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nusw i8, ptr %x, i64 %z
+  %gep2 = getelementptr nusw i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @gep_diff_base_same_indices_nuw_nusw(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices_nuw_nusw(
+; CHECK-NEXT:    [[X:%.*]] = getelementptr nuw i8, ptr [[X1:%.*]], i64 [[Z:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = getelementptr nusw i8, ptr [[Y1:%.*]], i64 [[Z]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nuw i8, ptr %x, i64 %z
+  %gep2 = getelementptr nusw i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}

From 2529a8df53af9bc6cecfd6c83404ffa5e89e3370 Mon Sep 17 00:00:00 2001
From: Chandler Carruth 
Date: Sat, 4 Jan 2025 02:23:54 -0800
Subject: [PATCH 147/480] Mechanically port bulk of x86 builtins to TableGen
 (#120831)

The goal is to make incremental (if small) progress towards fully
TableGen'ed builtins, and to unblock #120534 by gaining access to more
powerful TableGen-based representations.

The bulk `.td` file addition was generated with the help of a very rough
Python script. That script made no attempt to be robust or reusable, it
specifically handled only the cases in the X86 `.def` file.

Four entries from the `.def` file were not handled automatically as they
used `BUILTIN` rather than `TARGET_BUILTIN`. These were ported by hand
to an empty-feature `TargetBuiltin` entry, which seems like a better
match.

For all the automatically ported entries, the results were compared by
sorting and diffing the `.def` file and the generated `.inc` file. The
only differences were:

- Different horizontal whitespace

- Additional entries that had already been ported to the `.td` file.

- More systematically using `Oi` instead of `LLi` for the type `long
  long int` in the fully general `__builtin_ia32_...` builtins for OpenCL
  support. The `.def` file was only partially moved to this it seems, and
  the systematic migration has updated a few missed builtins.
---
 clang/include/clang/Basic/BuiltinsBase.td     |   13 +-
 clang/include/clang/Basic/BuiltinsX86.def     | 2225 -------
 clang/include/clang/Basic/BuiltinsX86.td      | 5390 +++++++++++++++++
 clang/include/clang/Basic/TargetBuiltins.h    |    2 -
 clang/lib/Basic/Targets/X86.cpp               |    8 -
 clang/utils/TableGen/ClangBuiltinsEmitter.cpp |   28 +-
 6 files changed, 5427 insertions(+), 2239 deletions(-)
 delete mode 100644 clang/include/clang/Basic/BuiltinsX86.def

diff --git a/clang/include/clang/Basic/BuiltinsBase.td b/clang/include/clang/Basic/BuiltinsBase.td
index cff182f3f282c..1a1096d41da40 100644
--- a/clang/include/clang/Basic/BuiltinsBase.td
+++ b/clang/include/clang/Basic/BuiltinsBase.td
@@ -88,6 +88,8 @@ class Builtin {
   // On some platforms, some functions are actually macros. In that case we need
   // to #undef them.
   bit RequiresUndef = 0;
+  // Enables builtins to generate `long long` outside of OpenCL and `long` inside.
+  bit EnableOpenCLLong = 0;
 }
 
 class CustomEntry {
@@ -95,9 +97,6 @@ class CustomEntry {
 }
 
 class AtomicBuiltin : Builtin;
-class TargetBuiltin : Builtin {
-  string Features = "";
-}
 
 class LibBuiltin<string header, string languages = "ALL_LANGUAGES"> : Builtin {
   string Header = header;
@@ -122,6 +121,14 @@ class OCL_DSELangBuiltin : LangBuiltin<"OCL_DSE">;
 class OCL_GASLangBuiltin : LangBuiltin<"OCL_GAS">;
 class OCLLangBuiltin : LangBuiltin<"ALL_OCL_LANGUAGES">;
 
+class TargetBuiltin : Builtin {
+  string Features = "";
+}
+class TargetLibBuiltin : TargetBuiltin {
+  string Header;
+  string Languages = "ALL_LANGUAGES";
+}
+
 class Template<list<string> substitutions,
                list<string> affixes,
                bit as_prefix = 0> {
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
deleted file mode 100644
index 352b3a9ec594a..0000000000000
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ /dev/null
@@ -1,2225 +0,0 @@
-//===--- BuiltinsX86.def - X86 Builtin function database --------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the X86-specific builtin function database.  Users of
-// this file must define the BUILTIN macro to make use of this information.
-//
-//===----------------------------------------------------------------------===//
-
-// The format of this database matches clang/Basic/Builtins.def.
-
-// FIXME: Ideally we would be able to pull this information from what
-// LLVM already knows about X86 builtins. We need to match the LLVM
-// definition anyway, since code generation will lower to the
-// intrinsic if one exists.
-
-#if defined(BUILTIN) && !defined(TARGET_BUILTIN)
-#   define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
-#endif
-
-#if defined(BUILTIN) && !defined(TARGET_HEADER_BUILTIN)
-#  define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANG, FEATURE) BUILTIN(ID, TYPE, ATTRS)
-#endif
-
-// MMX
-//
-// All MMX instructions will be generated via builtins. Any MMX vector
-// types (<1 x i64>, <2 x i32>, etc.) that aren't used by these builtins will be
-// expanded by the back-end.
-// FIXME: _mm_prefetch must be a built-in because it takes a compile-time constant
-// argument and our prior approach of using a #define to the current built-in
-// doesn't work in the presence of re-declaration of _mm_prefetch for windows.
-TARGET_BUILTIN(_mm_prefetch, "vcC*i", "nc", "mmx")
-
-// SSE intrinsics.
-
-TARGET_BUILTIN(__builtin_ia32_ldmxcsr, "vUi", "n", "sse")
-TARGET_HEADER_BUILTIN(_mm_setcsr, "vUi", "nh",XMMINTRIN_H, ALL_LANGUAGES, "sse")
-TARGET_BUILTIN(__builtin_ia32_stmxcsr, "Ui", "n", "sse")
-TARGET_HEADER_BUILTIN(_mm_getcsr, "Ui", "nh", XMMINTRIN_H, ALL_LANGUAGES, "sse")
-TARGET_BUILTIN(__builtin_ia32_cvtss2si, "iV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_cvttss2si, "iV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_movmskps, "iV4f", "nV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_sfence, "v", "n", "sse")
-TARGET_HEADER_BUILTIN(_mm_sfence, "v", "nh", XMMINTRIN_H, ALL_LANGUAGES, "sse")
-TARGET_BUILTIN(__builtin_ia32_rcpps, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_rcpss, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_rsqrtps, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_rsqrtss, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_sqrtps, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_sqrtss, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_shufps, "V4fV4fV4fIi", "ncV:128:", "sse")
-
-TARGET_BUILTIN(__builtin_ia32_maskmovdqu, "vV16cV16cc*", "nV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movmskpd, "iV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pmovmskb128, "iV16c", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movnti, "vi*i", "n", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pshufd, "V4iV4iIi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pshuflw, "V8sV8sIi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pshufhw, "V8sV8sIi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psadbw128, "V2OiV16cV16c", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_sqrtpd, "V2dV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_sqrtsd, "V2dV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_shufpd, "V2dV2dV2dIi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2dq, "V2OiV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2ps, "V4fV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2dq, "V4iV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtsd2si, "iV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvttsd2si, "iV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtsd2ss, "V4fV4fV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtps2dq, "V4iV4f", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvttps2dq, "V4iV4f", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_clflush, "vvC*", "n", "sse2")
-TARGET_HEADER_BUILTIN(_mm_clflush, "vvC*", "nh", EMMINTRIN_H, ALL_LANGUAGES, "sse2")
-TARGET_BUILTIN(__builtin_ia32_lfence, "v", "n", "sse2")
-TARGET_HEADER_BUILTIN(_mm_lfence, "v", "nh", EMMINTRIN_H, ALL_LANGUAGES, "sse2")
-TARGET_BUILTIN(__builtin_ia32_mfence, "v", "n", "sse2")
-TARGET_HEADER_BUILTIN(_mm_mfence, "v", "nh", EMMINTRIN_H, ALL_LANGUAGES, "sse2")
-TARGET_BUILTIN(__builtin_ia32_pause, "v", "n", "")
-TARGET_HEADER_BUILTIN(_mm_pause, "v", "nh", EMMINTRIN_H, ALL_LANGUAGES, "")
-TARGET_BUILTIN(__builtin_ia32_pmuludq128, "V2OiV4iV4i", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psraw128, "V8sV8sV8s", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrad128, "V4iV4iV4i", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrlw128, "V8sV8sV8s", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrld128, "V4iV4iV4i", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrlq128, "V2OiV2OiV2Oi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psllw128, "V8sV8sV8s", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pslld128, "V4iV4iV4i", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psllq128, "V2OiV2OiV2Oi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psllwi128, "V8sV8si", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pslldi128, "V4iV4ii", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psllqi128, "V2OiV2Oii", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrlwi128, "V8sV8si", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrldi128, "V4iV4ii", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrlqi128, "V2OiV2Oii", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrawi128, "V8sV8si", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psradi128, "V4iV4ii", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pmaddwd128, "V4iV8sV8s", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pslldqi128_byteshift, "V2OiV2OiIi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrldqi128_byteshift, "V2OiV2OiIi", "ncV:128:", "sse2")
-
-TARGET_BUILTIN(__builtin_ia32_monitor, "vvC*UiUi", "n", "sse3")
-TARGET_BUILTIN(__builtin_ia32_mwait, "vUiUi", "n", "sse3")
-TARGET_BUILTIN(__builtin_ia32_lddqu, "V16ccC*", "nV:128:", "sse3")
-
-TARGET_BUILTIN(__builtin_ia32_palignr128, "V16cV16cV16cIi", "ncV:128:", "ssse3")
-
-TARGET_BUILTIN(__builtin_ia32_insertps128, "V4fV4fV4fIc", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pblendvb128, "V16cV16cV16cV16c", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pblendw128, "V8sV8sV8sIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_blendpd, "V2dV2dV2dIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_blendps, "V4fV4fV4fIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_blendvpd, "V2dV2dV2dV2d", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_blendvps, "V4fV4fV4fV4f", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_packusdw128, "V8sV4iV4i", "ncV:128:", "sse4.1")
-
-TARGET_BUILTIN(__builtin_ia32_pmuldq128, "V2OiV4iV4i", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_roundps, "V4fV4fIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_roundss, "V4fV4fV4fIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_roundsd, "V2dV2dV2dIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_roundpd, "V2dV2dIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_dpps, "V4fV4fV4fIc", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_dppd, "V2dV2dV2dIc", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_ptestz128, "iV2OiV2Oi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_ptestc128, "iV2OiV2Oi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_ptestnzc128, "iV2OiV2Oi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_mpsadbw128, "V16cV16cV16cIc", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_phminposuw128, "V8sV8s", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v16qi, "cV16cIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v16qi, "V16cV16ccIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v4si, "V4iV4iiIi", "ncV:128:", "sse4.1")
-
-// SSE 4.2
-TARGET_BUILTIN(__builtin_ia32_pcmpistrm128, "V16cV16cV16cIc", "ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpistri128, "iV16cV16cIc", "ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestrm128, "V16cV16ciV16ciIc", "ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestri128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-
-TARGET_BUILTIN(__builtin_ia32_pcmpistria128, "iV16cV16cIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpistric128, "iV16cV16cIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpistrio128, "iV16cV16cIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpistris128, "iV16cV16cIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpistriz128, "iV16cV16cIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestria128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestric128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestrio128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestris128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestriz128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-
-TARGET_BUILTIN(__builtin_ia32_crc32qi, "UiUiUc", "nc", "crc32")
-TARGET_BUILTIN(__builtin_ia32_crc32hi, "UiUiUs", "nc", "crc32")
-TARGET_BUILTIN(__builtin_ia32_crc32si, "UiUiUi", "nc", "crc32")
-
-// SSE4a
-TARGET_BUILTIN(__builtin_ia32_extrqi, "V2OiV2OiIcIc", "ncV:128:", "sse4a")
-TARGET_BUILTIN(__builtin_ia32_extrq, "V2OiV2OiV16c", "ncV:128:", "sse4a")
-TARGET_BUILTIN(__builtin_ia32_insertqi, "V2OiV2OiV2OiIcIc", "ncV:128:", "sse4a")
-TARGET_BUILTIN(__builtin_ia32_insertq, "V2OiV2OiV2Oi", "ncV:128:", "sse4a")
-TARGET_BUILTIN(__builtin_ia32_movntsd, "vd*V2d", "nV:128:", "sse4a")
-TARGET_BUILTIN(__builtin_ia32_movntss, "vf*V4f", "nV:128:", "sse4a")
-
-// AES
-TARGET_BUILTIN(__builtin_ia32_aesenc128, "V2OiV2OiV2Oi", "ncV:128:", "aes")
-TARGET_BUILTIN(__builtin_ia32_aesenclast128, "V2OiV2OiV2Oi", "ncV:128:", "aes")
-TARGET_BUILTIN(__builtin_ia32_aesdec128, "V2OiV2OiV2Oi", "ncV:128:", "aes")
-TARGET_BUILTIN(__builtin_ia32_aesdeclast128, "V2OiV2OiV2Oi", "ncV:128:", "aes")
-TARGET_BUILTIN(__builtin_ia32_aesimc128, "V2OiV2Oi", "ncV:128:", "aes")
-TARGET_BUILTIN(__builtin_ia32_aeskeygenassist128, "V2OiV2OiIc", "ncV:128:", "aes")
-
-// VAES
-TARGET_BUILTIN(__builtin_ia32_aesenc256, "V4OiV4OiV4Oi", "ncV:256:", "vaes")
-TARGET_BUILTIN(__builtin_ia32_aesenc512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512,vaes")
-TARGET_BUILTIN(__builtin_ia32_aesenclast256, "V4OiV4OiV4Oi", "ncV:256:", "vaes")
-TARGET_BUILTIN(__builtin_ia32_aesenclast512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512,vaes")
-TARGET_BUILTIN(__builtin_ia32_aesdec256, "V4OiV4OiV4Oi", "ncV:256:", "vaes")
-TARGET_BUILTIN(__builtin_ia32_aesdec512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512,vaes")
-TARGET_BUILTIN(__builtin_ia32_aesdeclast256, "V4OiV4OiV4Oi", "ncV:256:", "vaes")
-TARGET_BUILTIN(__builtin_ia32_aesdeclast512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512,vaes")
-
-// GFNI
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineinvqb_v16qi, "V16cV16cV16cIc", "ncV:128:", "gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineinvqb_v32qi, "V32cV32cV32cIc", "ncV:256:", "avx,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineinvqb_v64qi, "V64cV64cV64cIc", "ncV:512:", "avx512f,evex512,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineqb_v16qi, "V16cV16cV16cIc", "ncV:128:", "gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineqb_v32qi, "V32cV32cV32cIc", "ncV:256:", "avx,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineqb_v64qi, "V64cV64cV64cIc", "ncV:512:", "avx512f,evex512,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8mulb_v16qi, "V16cV16cV16c", "ncV:128:", "gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8mulb_v32qi, "V32cV32cV32c", "ncV:256:", "avx,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8mulb_v64qi, "V64cV64cV64c", "ncV:512:", "avx512f,evex512,gfni")
-
-// CLMUL
-TARGET_BUILTIN(__builtin_ia32_pclmulqdq128, "V2OiV2OiV2OiIc", "ncV:128:", "pclmul")
-
-// VPCLMULQDQ
-TARGET_BUILTIN(__builtin_ia32_pclmulqdq256, "V4OiV4OiV4OiIc", "ncV:256:", "vpclmulqdq")
-TARGET_BUILTIN(__builtin_ia32_pclmulqdq512, "V8OiV8OiV8OiIc", "ncV:512:", "avx512f,evex512,vpclmulqdq")
-
-// AVX
-TARGET_BUILTIN(__builtin_ia32_vpermilvarpd, "V2dV2dV2Oi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilvarps, "V4fV4fV4i", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilvarpd256, "V4dV4dV4Oi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilvarps256, "V8fV8fV8i", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_blendpd256, "V4dV4dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_blendps256, "V8fV8fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_blendvpd256, "V4dV4dV4dV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_blendvps256, "V8fV8fV8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_shufpd256, "V4dV4dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_shufps256, "V8fV8fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_dpps256, "V8fV8fV8fIc", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cmppd256, "V4dV4dV4dIc", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cmpps256, "V8fV8fV8fIc", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vextractf128_pd256, "V2dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vextractf128_ps256, "V4fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vextractf128_si256, "V4iV8iIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2ps256, "V4fV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvtps2dq256, "V8iV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2dq256, "V4iV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2dq256, "V4iV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvttps2dq256, "V8iV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vperm2f128_pd256, "V4dV4dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vperm2f128_ps256, "V8fV8fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vperm2f128_si256, "V8iV8iV8iIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilpd, "V2dV2dIi", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilps, "V4fV4fIi", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilpd256, "V4dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilps256, "V8fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vinsertf128_pd256, "V4dV4dV2dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vinsertf128_ps256, "V8fV8fV4fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vinsertf128_si256, "V8iV8iV4iIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_sqrtpd256, "V4dV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_sqrtps256, "V8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_rsqrtps256, "V8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_rcpps256, "V8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_roundpd256, "V4dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_roundps256, "V8fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestzpd, "iV2dV2d", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestcpd, "iV2dV2d", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestnzcpd, "iV2dV2d", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestzps, "iV4fV4f", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestcps, "iV4fV4f", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestnzcps, "iV4fV4f", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestzpd256, "iV4dV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestcpd256, "iV4dV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestnzcpd256, "iV4dV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestzps256, "iV8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestcps256, "iV8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestnzcps256, "iV8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_ptestz256, "iV4OiV4Oi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_ptestc256, "iV4OiV4Oi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_ptestnzc256, "iV4OiV4Oi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_movmskpd256, "iV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_movmskps256, "iV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vzeroall, "v", "n", "avx")
-TARGET_BUILTIN(__builtin_ia32_vzeroupper, "v", "n", "avx")
-TARGET_BUILTIN(__builtin_ia32_lddqu256, "V32ccC*", "nV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskloadpd, "V2dV2dC*V2Oi", "nV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskloadps, "V4fV4fC*V4i", "nV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskloadpd256, "V4dV4dC*V4Oi", "nV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskloadps256, "V8fV8fC*V8i", "nV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskstorepd, "vV2d*V2OiV2d", "nV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskstoreps, "vV4f*V4iV4f", "nV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskstorepd256, "vV4d*V4OiV4d", "nV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskstoreps256, "vV8f*V8iV8f", "nV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v32qi, "cV32cIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v16hi, "sV16sIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v8si, "iV8iIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v32qi, "V32cV32ccIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v16hi, "V16sV16ssIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v8si, "V8iV8iiIi", "ncV:256:", "avx")
-
-// AVX2
-TARGET_BUILTIN(__builtin_ia32_mpsadbw256, "V32cV32cV32cIc", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_packsswb256, "V32cV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_packssdw256, "V16sV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_packuswb256, "V32cV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_packusdw256, "V16sV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_palignr256, "V32cV32cV32cIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pavgb256, "V32cV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pavgw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pblendvb256, "V32cV32cV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pblendw256, "V16sV16sV16sIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phaddw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phaddd256, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phaddsw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phsubw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phsubd256, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phsubsw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmaddubsw256, "V16sV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmaddwd256, "V8iV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmovmskb256, "iV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmuldq256, "V4OiV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmulhrsw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmulhuw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmulhw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmuludq256, "V4OiV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psadbw256, "V4OiV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pshufb256, "V32cV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pshufd256, "V8iV8iIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pshuflw256, "V16sV16sIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pshufhw256, "V16sV16sIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psignb256, "V32cV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psignw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psignd256, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllwi256, "V16sV16si", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllw256, "V16sV16sV8s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pslldqi256_byteshift, "V4OiV4OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pslldi256, "V8iV8ii", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pslld256, "V8iV8iV4i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllqi256, "V4OiV4Oii", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllq256, "V4OiV4OiV2Oi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrawi256, "V16sV16si", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psraw256, "V16sV16sV8s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psradi256, "V8iV8ii", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrad256, "V8iV8iV4i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrldqi256_byteshift, "V4OiV4OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlwi256, "V16sV16si", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlw256, "V16sV16sV8s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrldi256, "V8iV8ii", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrld256, "V8iV8iV4i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlqi256, "V4OiV4Oii", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlq256, "V4OiV4OiV2Oi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pblendd128, "V4iV4iV4iIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pblendd256, "V8iV8iV8iIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_permvarsi256, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_permdf256, "V4dV4dIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_permvarsf256, "V8fV8fV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_permti256, "V4OiV4OiV4OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_permdi256, "V4OiV4OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_extract128i256, "V2OiV4OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_insert128i256, "V4OiV4OiV2OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskloadd256, "V8iV8iC*V8i", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskloadq256, "V4OiV4OiC*V4Oi", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskloadd, "V4iV4iC*V4i", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskloadq, "V2OiV2OiC*V2Oi", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskstored256, "vV8i*V8iV8i", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskstoreq256, "vV4Oi*V4OiV4Oi", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskstored, "vV4i*V4iV4i", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskstoreq, "vV2Oi*V2OiV2Oi", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllv8si, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllv4si, "V4iV4iV4i", "ncV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllv4di, "V4OiV4OiV4Oi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllv2di, "V2OiV2OiV2Oi", "ncV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrav8si, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrav4si, "V4iV4iV4i", "ncV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlv8si, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlv4si, "V4iV4iV4i", "ncV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlv4di, "V4OiV4OiV4Oi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlv2di, "V2OiV2OiV2Oi", "ncV:128:", "avx2")
-
-// GATHER
-TARGET_BUILTIN(__builtin_ia32_gatherd_pd, "V2dV2ddC*V4iV2dIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_pd256, "V4dV4ddC*V4iV4dIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_pd, "V2dV2ddC*V2OiV2dIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_pd256, "V4dV4ddC*V4OiV4dIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_ps, "V4fV4ffC*V4iV4fIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_ps256, "V8fV8ffC*V8iV8fIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_ps, "V4fV4ffC*V2OiV4fIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_ps256, "V4fV4ffC*V4OiV4fIc", "nV:256:", "avx2")
-
-TARGET_BUILTIN(__builtin_ia32_gatherd_q, "V2OiV2OiOiC*V4iV2OiIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_q256, "V4OiV4OiOiC*V4iV4OiIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_q, "V2OiV2OiOiC*V2OiV2OiIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_q256, "V4OiV4OiOiC*V4OiV4OiIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_d, "V4iV4iiC*V4iV4iIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_d256, "V8iV8iiC*V8iV8iIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_d, "V4iV4iiC*V2OiV4iIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_d256, "V4iV4iiC*V4OiV4iIc", "nV:256:", "avx2")
-
-// F16C
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ph, "V8sV4fIi", "ncV:128:", "f16c")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ph256, "V8sV8fIi", "ncV:256:", "f16c")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ps, "V4fV8s", "ncV:128:", "f16c")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ps256, "V8fV8s", "ncV:256:", "f16c")
-
-// RDRAND
-TARGET_BUILTIN(__builtin_ia32_rdrand16_step, "UiUs*", "n", "rdrnd")
-TARGET_BUILTIN(__builtin_ia32_rdrand32_step, "UiUi*", "n", "rdrnd")
-
-// FXSR
-TARGET_BUILTIN(__builtin_ia32_fxrstor, "vv*", "n", "fxsr")
-TARGET_BUILTIN(__builtin_ia32_fxsave, "vv*", "n", "fxsr")
-
-// XSAVE
-TARGET_BUILTIN(__builtin_ia32_xsave, "vv*UOi", "n", "xsave")
-TARGET_BUILTIN(__builtin_ia32_xrstor, "vv*UOi", "n", "xsave")
-TARGET_BUILTIN(__builtin_ia32_xgetbv, "UOiUi", "n", "xsave")
-TARGET_HEADER_BUILTIN(_xgetbv, "UWiUi", "nh", IMMINTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_BUILTIN(__builtin_ia32_xsetbv, "vUiUOi", "n", "xsave")
-TARGET_HEADER_BUILTIN(_xsetbv, "vUiUWi", "nh", IMMINTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_BUILTIN(__builtin_ia32_xsaveopt, "vv*UOi", "n", "xsaveopt")
-TARGET_BUILTIN(__builtin_ia32_xrstors, "vv*UOi", "n", "xsaves")
-TARGET_BUILTIN(__builtin_ia32_xsavec, "vv*UOi", "n", "xsavec")
-TARGET_BUILTIN(__builtin_ia32_xsaves, "vv*UOi", "n", "xsaves")
-
-// SHSTK
-TARGET_BUILTIN(__builtin_ia32_incsspd, "vUi", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_rdsspd, "UiUi", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_saveprevssp, "v", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_rstorssp, "vv*", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_wrssd, "vUiv*", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_wrussd, "vUiv*", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_setssbsy, "v", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_clrssbsy, "vv*", "n", "shstk")
-
-//CLFLUSHOPT
-TARGET_BUILTIN(__builtin_ia32_clflushopt, "vvC*", "n", "clflushopt")
-
-//CLWB
-TARGET_BUILTIN(__builtin_ia32_clwb, "vvC*", "n", "clwb")
-
-//WB[NO]INVD
-TARGET_BUILTIN(__builtin_ia32_wbinvd, "v", "n", "")
-TARGET_BUILTIN(__builtin_ia32_wbnoinvd, "v", "n", "wbnoinvd")
-
-// ADX
-TARGET_BUILTIN(__builtin_ia32_addcarryx_u32, "UcUcUiUiUi*", "nE", "")
-TARGET_BUILTIN(__builtin_ia32_subborrow_u32, "UcUcUiUiUi*", "nE", "")
-
-// RDSEED
-TARGET_BUILTIN(__builtin_ia32_rdseed16_step, "UiUs*", "n", "rdseed")
-TARGET_BUILTIN(__builtin_ia32_rdseed32_step, "UiUi*", "n", "rdseed")
-
-// LZCNT
-TARGET_BUILTIN(__builtin_ia32_lzcnt_u16, "UsUs", "ncE", "lzcnt")
-TARGET_BUILTIN(__builtin_ia32_lzcnt_u32, "UiUi", "ncE", "lzcnt")
-
-// BMI
-TARGET_BUILTIN(__builtin_ia32_bextr_u32, "UiUiUi", "ncE", "bmi")
-TARGET_BUILTIN(__builtin_ia32_tzcnt_u16, "UsUs", "ncE", "")
-TARGET_BUILTIN(__builtin_ia32_tzcnt_u32, "UiUi", "ncE", "")
-
-// BMI2
-TARGET_BUILTIN(__builtin_ia32_bzhi_si, "UiUiUi", "ncE", "bmi2")
-TARGET_BUILTIN(__builtin_ia32_pdep_si, "UiUiUi", "ncE", "bmi2")
-TARGET_BUILTIN(__builtin_ia32_pext_si, "UiUiUi", "ncE", "bmi2")
-
-// TBM
-TARGET_BUILTIN(__builtin_ia32_bextri_u32, "UiUiIUi", "ncE", "tbm")
-
-// LWP
-TARGET_BUILTIN(__builtin_ia32_llwpcb, "vv*", "n", "lwp")
-TARGET_BUILTIN(__builtin_ia32_slwpcb, "v*", "n", "lwp")
-TARGET_BUILTIN(__builtin_ia32_lwpins32, "UcUiUiIUi", "n", "lwp")
-TARGET_BUILTIN(__builtin_ia32_lwpval32, "vUiUiIUi", "n", "lwp")
-
-// SHA
-TARGET_BUILTIN(__builtin_ia32_sha1rnds4, "V4iV4iV4iIc", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha1nexte, "V4iV4iV4i", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha1msg1, "V4iV4iV4i", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha1msg2, "V4iV4iV4i", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha256rnds2, "V4iV4iV4iV4i", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha256msg1, "V4iV4iV4i", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha256msg2, "V4iV4iV4i", "ncV:128:", "sha")
-
-// FMA
-TARGET_BUILTIN(__builtin_ia32_vfmaddps, "V4fV4fV4fV4f", "ncV:128:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd, "V2dV2dV2dV2d", "ncV:128:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddss3, "V4fV4fV4fV4f", "ncV:128:", "fma")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsd3, "V2dV2dV2dV2d", "ncV:128:", "fma")
-TARGET_BUILTIN(__builtin_ia32_vfmaddss, "V4fV4fV4fV4f", "ncV:128:", "fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsd, "V2dV2dV2dV2d", "ncV:128:", "fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps, "V4fV4fV4fV4f", "ncV:128:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd, "V2dV2dV2dV2d", "ncV:128:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps256, "V8fV8fV8fV8f", "ncV:256:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd256, "V4dV4dV4dV4d", "ncV:256:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256, "V8fV8fV8fV8f", "ncV:256:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256, "V4dV4dV4dV4d", "ncV:256:", "fma|fma4")
-
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_mask, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_maskz, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_mask3, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmsubpd512_mask3, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps512_maskz, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps512_mask3, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmsubps512_mask3, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_mask, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_maskz, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_mask3, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd512_mask3, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_maskz, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_mask3, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddps512_mask3, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-
-// XOP
-TARGET_BUILTIN(__builtin_ia32_vpmacssww, "V8sV8sV8sV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacsww, "V8sV8sV8sV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacsswd, "V4iV8sV8sV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacswd, "V4iV8sV8sV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacssdd, "V4iV4iV4iV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacsdd, "V4iV4iV4iV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacssdql, "V2OiV4iV4iV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacsdql, "V2OiV4iV4iV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacssdqh, "V2OiV4iV4iV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacsdqh, "V2OiV4iV4iV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmadcsswd, "V4iV8sV8sV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmadcswd, "V4iV8sV8sV4i", "ncV:128:", "xop")
-
-TARGET_BUILTIN(__builtin_ia32_vphaddbw, "V8sV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddbd, "V4iV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddbq, "V2OiV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddwd, "V4iV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddwq, "V2OiV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphadddq, "V2OiV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddubw, "V8sV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddubd, "V4iV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddubq, "V2OiV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphadduwd, "V4iV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphadduwq, "V2OiV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddudq, "V2OiV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphsubbw, "V8sV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphsubwd, "V4iV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphsubdq, "V2OiV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpperm, "V16cV16cV16cV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotb, "V16cV16cV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotw, "V8sV8sV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotd, "V4iV4iV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotq, "V2OiV2OiV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotbi, "V16cV16cIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotwi, "V8sV8sIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotdi, "V4iV4iIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotqi, "V2OiV2OiIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshlb, "V16cV16cV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshlw, "V8sV8sV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshld, "V4iV4iV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshlq, "V2OiV2OiV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshab, "V16cV16cV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshaw, "V8sV8sV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshad, "V4iV4iV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshaq, "V2OiV2OiV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomub, "V16cV16cV16cIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomuw, "V8sV8sV8sIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomud, "V4iV4iV4iIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomuq, "V2OiV2OiV2OiIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomb, "V16cV16cV16cIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomw, "V8sV8sV8sIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomd, "V4iV4iV4iIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomq, "V2OiV2OiV2OiIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpermil2pd, "V2dV2dV2dV2OiIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpermil2pd256, "V4dV4dV4dV4OiIc", "ncV:256:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpermil2ps, "V4fV4fV4fV4iIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpermil2ps256, "V8fV8fV8fV8iIc", "ncV:256:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczss, "V4fV4f", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczsd, "V2dV2d", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczps, "V4fV4f", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczpd, "V2dV2d", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczps256, "V8fV8f", "ncV:256:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczpd256, "V4dV4d", "ncV:256:", "xop")
-
-TARGET_BUILTIN(__builtin_ia32_xbegin, "i", "n", "rtm")
-TARGET_BUILTIN(__builtin_ia32_xend, "v", "n", "rtm")
-TARGET_BUILTIN(__builtin_ia32_xabort, "vIc", "n", "rtm")
-TARGET_BUILTIN(__builtin_ia32_xtest, "i", "n", "rtm")
-
-BUILTIN(__builtin_ia32_rdpmc, "UOii", "")
-BUILTIN(__builtin_ia32_rdtsc, "UOi", "")
-BUILTIN(__rdtsc, "UOi", "")
-BUILTIN(__builtin_ia32_rdtscp, "UOiUi*", "")
-
-TARGET_BUILTIN(__builtin_ia32_rdpid, "Ui", "n", "rdpid")
-TARGET_BUILTIN(__builtin_ia32_rdpru, "ULLii", "n", "rdpru")
-
-// PKU
-TARGET_BUILTIN(__builtin_ia32_rdpkru, "Ui", "n", "pku")
-TARGET_BUILTIN(__builtin_ia32_wrpkru, "vUi", "n", "pku")
-
-// AVX-512
-TARGET_BUILTIN(__builtin_ia32_sqrtpd512, "V8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_sqrtps512, "V16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14sd_mask, "V2dV2dV2dV2dUc", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14ss_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14pd512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14ps512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_rcp14sd_mask, "V2dV2dV2dV2dUc", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rcp14ss_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rcp14pd512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rcp14ps512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_cvttps2dq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttps2udq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2dq512_mask, "V8iV8dV8iUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2udq512_mask, "V8iV8dV8iUcIi", "ncV:512:", "avx512f,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_cmpps512_mask,   "UsV16fV16fIiUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpps256_mask,   "UcV8fV8fIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpps128_mask,   "UcV4fV4fIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmppd512_mask, "UcV8dV8dIiUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmppd256_mask, "UcV4dV4dIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmppd128_mask, "UcV2dV2dIiUc", "ncV:128:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_rndscaleps_mask, "V16fV16fIiV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rndscalepd_mask, "V8dV8dIiV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtps2dq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2dq512_mask, "V8iV8dV8iUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtps2udq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2udq512_mask, "V8iV8dV8iUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_minps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_minpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_maxps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_maxpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtdq2ps512_mask, "V16fV16iV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtudq2ps512_mask, "V16fV16iV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2ps512_mask, "V8fV8dV8fUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ph512_mask, "V16sV16fIiV16sUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ps512_mask, "V16fV16sV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmuldq512, "V8OiV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmuludq512, "V8OiV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loaddqusi512_mask, "V16iiC*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loaddqudi512_mask, "V8OiOiC*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loadups512_mask, "V16ffC*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loadaps512_mask, "V16fV16fC*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loadupd512_mask, "V8ddC*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loadapd512_mask, "V8dV8dC*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storedqudi512_mask, "vOi*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storedqusi512_mask, "vi*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storeupd512_mask, "vd*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storeapd512_mask, "vV8d*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storeups512_mask, "vf*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storeaps512_mask, "vV16f*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_alignq512, "V8OiV8OiV8OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_alignd512, "V16iV16iV16iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_alignd128, "V4iV4iV4iIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_alignd256, "V8iV8iV8iIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_alignq128, "V2OiV2OiV2OiIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_alignq256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_extractf64x4_mask, "V4dV8dIiV4dUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_extractf32x4_mask, "V4fV16fIiV4fUc", "ncV:512:", "avx512f,evex512")
-
-// AVX-VNNI and AVX512-VNNI
-TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512")
-
-// AVX-VNNI-INT8
-TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-
-// MOVRS
-TARGET_BUILTIN(__builtin_ia32_prefetchrs, "vvC*", "nc", "movrs")
-
-TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div2di, "V2OiV2OivC*V2OiUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div4df, "V4dV4dvC*V4OiUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div4di, "V4OiV4OivC*V4OiUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div4sf, "V4fV4fvC*V2OiUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div4si, "V4iV4ivC*V2OiUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div8sf, "V4fV4fvC*V4OiUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div8si, "V4iV4ivC*V4OiUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv2df, "V2dV2dvC*V4iUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv2di, "V2OiV2OivC*V4iUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv4df, "V4dV4dvC*V4iUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv4di, "V4OiV4OivC*V4iUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv4sf, "V4fV4fvC*V4iUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv4si, "V4iV4ivC*V4iUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv8sf, "V8fV8fvC*V8iUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv8si, "V8iV8ivC*V8iUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gathersiv8df, "V8dV8dvC*V8iUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gathersiv16sf, "V16fV16fvC*V16iUsIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherdiv8df, "V8dV8dvC*V8OiUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherdiv16sf, "V8fV8fvC*V8OiUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gathersiv8di, "V8OiV8OivC*V8iUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gathersiv16si, "V16iV16ivC*V16iUsIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherdiv8di, "V8OiV8OivC*V8OiUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherdiv16si, "V8iV8ivC*V8OiUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scattersiv8df, "vv*UcV8iV8dIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scattersiv16sf, "vv*UsV16iV16fIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv8df,  "vv*UcV8OiV8dIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv16sf, "vv*UcV8OiV8fIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scattersiv8di,  "vv*UcV8iV8OiIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scattersiv16si, "vv*UsV16iV16iIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv8di,  "vv*UcV8OiV8OiIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv16si, "vv*UcV8OiV8iIi", "nV:512:", "avx512f,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_knotqi, "UcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_knothi, "UsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_knotsi, "UiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_knotdi, "UOiUOi", "nc", "avx512bw")
-
-TARGET_BUILTIN(__builtin_ia32_cmpb128_mask, "UsV16cV16cIiUs", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_cmpd128_mask, "UcV4iV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpq128_mask, "UcV2OiV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpw128_mask, "UcV8sV8sIiUc", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_cmpb256_mask, "UiV32cV32cIiUi", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_cmpd256_mask, "UcV8iV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpq256_mask, "UcV4OiV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpw256_mask, "UsV16sV16sIiUs", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_cmpb512_mask, "UOiV64cV64cIiUOi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpd512_mask, "UsV16iV16iIiUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpq512_mask, "UcV8OiV8OiIiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpw512_mask, "UiV32sV32sIiUi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_ucmpb128_mask, "UsV16cV16cIiUs", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ucmpd128_mask, "UcV4iV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_ucmpq128_mask, "UcV2OiV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_ucmpw128_mask, "UcV8sV8sIiUc", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ucmpb256_mask, "UiV32cV32cIiUi", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ucmpd256_mask, "UcV8iV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_ucmpq256_mask, "UcV4OiV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_ucmpw256_mask, "UsV16sV16sIiUs", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ucmpb512_mask, "UOiV64cV64cIiUOi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_ucmpd512_mask, "UsV16iV16iIiUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_ucmpq512_mask, "UcV8OiV8OiIiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_ucmpw512_mask, "UiV32sV32sIiUi", "ncV:512:", "avx512bw,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_packssdw512, "V32sV16iV16i", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_packsswb512, "V64cV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_packusdw512, "V32sV16iV16i", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_packuswb512, "V64cV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pavgb512, "V64cV64cV64c", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pavgw512, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pshufb512, "V64cV64cV64c", "ncV:512:", "avx512bw,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpconflictdi_128, "V2OiV2Oi", "ncV:128:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpconflictdi_256, "V4OiV4Oi", "ncV:256:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpconflictsi_128, "V4iV4i", "ncV:128:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpconflictsi_256, "V8iV8i", "ncV:256:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpconflictdi_512, "V8OiV8Oi", "ncV:512:", "avx512cd,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpconflictsi_512, "V16iV16i", "ncV:512:", "avx512cd,evex512")
-TARGET_BUILTIN(__builtin_ia32_vplzcntd_512, "V16iV16i", "ncV:512:", "avx512cd,evex512")
-TARGET_BUILTIN(__builtin_ia32_vplzcntq_512, "V8OiV8Oi", "ncV:512:", "avx512cd,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpshufbitqmb128_mask, "UsV16cV16cUs", "ncV:128:", "avx512vl,avx512bitalg")
-TARGET_BUILTIN(__builtin_ia32_vpshufbitqmb256_mask, "UiV32cV32cUi", "ncV:256:", "avx512vl,avx512bitalg")
-TARGET_BUILTIN(__builtin_ia32_vpshufbitqmb512_mask, "UOiV64cV64cUOi", "ncV:512:", "avx512bitalg,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_pmulhrsw512, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmulhuw512, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmulhw512, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_addpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_addps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_divpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_divps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_mulpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_mulps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_subpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_subps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_pmaddubsw512, "V32sV64cV64c", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmaddwd512, "V16iV32sV32s", "ncV:512:", "avx512bw,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_addss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_divss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_mulss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_subss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_maxss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_minss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_addsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_divsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_mulsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_subsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_maxsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_minsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-
-TARGET_BUILTIN(__builtin_ia32_compressdf128_mask, "V2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressdf256_mask, "V4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressdi128_mask, "V2OiV2OiV2OiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressdi256_mask, "V4OiV4OiV4OiUc", "ncV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_compresshi128_mask, "V8sV8sV8sUc", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compresshi256_mask, "V16sV16sV16sUs", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compressqi128_mask, "V16cV16cV16cUs", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compressqi256_mask, "V32cV32cV32cUi", "ncV:256:", "avx512vl,avx512vbmi2")
-
-TARGET_BUILTIN(__builtin_ia32_compresssf128_mask, "V4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compresssf256_mask, "V8fV8fV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compresssi128_mask, "V4iV4iV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compresssi256_mask, "V8iV8iV8iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoredf128_mask, "vV2d*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoredf256_mask, "vV4d*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoredi128_mask, "vV2Oi*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoredi256_mask, "vV4Oi*V4OiUc", "nV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_compressstorehi128_mask, "vV8s*V8sUc", "nV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compressstorehi256_mask, "vV16s*V16sUs", "nV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compressstoreqi128_mask, "vV16c*V16cUs", "nV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compressstoreqi256_mask, "vV32c*V32cUi", "nV:256:", "avx512vl,avx512vbmi2")
-
-TARGET_BUILTIN(__builtin_ia32_compressstoresf128_mask, "vV4f*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoresf256_mask, "vV8f*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoresi128_mask, "vV4i*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoresi256_mask, "vV8i*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2dq128_mask, "V4iV2dV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2ps_mask, "V4fV2dV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2udq128_mask, "V4iV2dV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2udq256_mask, "V4iV4dV4iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtps2udq128_mask, "V4iV4fV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtps2udq256_mask, "V8iV8fV8iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2dq128_mask, "V4iV2dV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2udq128_mask, "V4iV2dV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2udq256_mask, "V4iV4dV4iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvttps2udq128_mask, "V4iV4fV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvttps2udq256_mask, "V8iV8fV8iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expanddf128_mask, "V2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expanddf256_mask, "V4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expanddi128_mask, "V2OiV2OiV2OiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expanddi256_mask, "V4OiV4OiV4OiUc", "ncV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_expandhi128_mask, "V8sV8sV8sUc", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandhi256_mask, "V16sV16sV16sUs", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandqi128_mask, "V16cV16cV16cUs", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandqi256_mask, "V32cV32cV32cUi", "ncV:256:", "avx512vl,avx512vbmi2")
-
-TARGET_BUILTIN(__builtin_ia32_expandloaddf128_mask, "V2dV2dC*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloaddf256_mask, "V4dV4dC*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloaddi128_mask, "V4iV2OiC*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloaddi256_mask, "V4OiV4OiC*V4OiUc", "nV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_expandloadhi128_mask, "V8sV8sC*V8sUc", "nV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandloadhi256_mask, "V16sV16sC*V16sUs", "nV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandloadqi128_mask, "V16cV16cC*V16cUs", "nV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandloadqi256_mask, "V32cV32cC*V32cUi", "nV:256:", "avx512vl,avx512vbmi2")
-
-TARGET_BUILTIN(__builtin_ia32_expandloadsf128_mask, "V4fV4fC*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloadsf256_mask, "V8fV8fC*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloadsi128_mask, "V4iV4iC*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloadsi256_mask, "V8iV8iC*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandsf128_mask, "V4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandsf256_mask, "V8fV8fV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandsi128_mask, "V4iV4iV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandsi256_mask, "V8iV8iV8iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexppd128_mask, "V2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexppd256_mask, "V4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexpps128_mask, "V4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexpps256_mask, "V8fV8fV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscalepd_128_mask, "V2dV2dIiV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscalepd_256_mask, "V4dV4dIiV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscaleps_128_mask, "V4fV4fIiV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscaleps_256_mask, "V8fV8fIiV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefpd128_mask, "V2dV2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefpd256_mask, "V4dV4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefps128_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefps256_mask, "V8fV8fV8fV8fUc", "ncV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_scatterdiv2df, "vv*UcV2OiV2dIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv2di, "vv*UcV2OiV2OiIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv4df, "vv*UcV4OiV4dIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv4di, "vv*UcV4OiV4OiIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv4sf, "vv*UcV2OiV4fIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv4si, "vv*UcV2OiV4iIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv8sf, "vv*UcV4OiV4fIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv8si, "vv*UcV4OiV4iIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv2df, "vv*UcV4iV2dIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv2di, "vv*UcV4iV2OiIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv4df, "vv*UcV4iV4dIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv4di, "vv*UcV4iV4OiIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv4sf, "vv*UcV4iV4fIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv4si, "vv*UcV4iV4iIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv8sf, "vv*UcV8iV8fIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv8si, "vv*UcV8iV8iIi", "nV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_vpermi2vard128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2vard256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2vard512, "V16iV16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varpd128, "V2dV2dV2OiV2d", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varpd256, "V4dV4dV4OiV4d", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varpd512, "V8dV8dV8OiV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varps128, "V4fV4fV4iV4f", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varps256, "V8fV8fV8iV8f", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varps512, "V16fV16fV16iV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varqi128, "V16cV16cV16cV16c", "ncV:128:", "avx512vbmi,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varqi256, "V32cV32cV32cV32c", "ncV:256:", "avx512vbmi,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varqi512, "V64cV64cV64cV64c", "ncV:512:", "avx512vbmi,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varhi128, "V8sV8sV8sV8s", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varhi256, "V16sV16sV16sV16s", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varhi512, "V32sV32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpshldd128, "V4iV4iV4iIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldd256, "V8iV8iV8iIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldd512, "V16iV16iV16iIi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshldq128, "V2OiV2OiV2OiIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldq256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldq512, "V8OiV8OiV8OiIi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshldw128, "V8sV8sV8sIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldw256, "V16sV16sV16sIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldw512, "V32sV32sV32sIi", "ncV:512:", "avx512vbmi2,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpshldvd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshldvq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshldvw128, "V8sV8sV8sV8s", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvw256, "V16sV16sV16sV16s", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvw512, "V32sV32sV32sV32s", "ncV:512:", "avx512vbmi2,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpshrdvd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvw128, "V8sV8sV8sV8s", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvw256, "V16sV16sV16sV16s", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvw512, "V32sV32sV32sV32s", "ncV:512:", "avx512vbmi2,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpshrdd128, "V4iV4iV4iIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdd256, "V8iV8iV8iIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdd512, "V16iV16iV16iIi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshrdq128, "V2OiV2OiV2OiIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdq256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdq512, "V8OiV8OiV8OiIi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshrdw128, "V8sV8sV8sIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdw256, "V16sV16sV16sIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdw512, "V32sV32sV32sIi", "ncV:512:", "avx512vbmi2,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_pmovswb512_mask, "V32cV32sV32cUi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb512_mask, "V32cV32sV32cUi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovwb512_mask, "V32cV32sV32cUi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2qq128_mask, "V2OiV2dV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2qq256_mask, "V4OiV4dV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2uqq128_mask, "V2OiV2dV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2uqq256_mask, "V4OiV4dV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtps2qq128_mask, "V2OiV4fV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtps2qq256_mask, "V4OiV4fV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtps2uqq128_mask, "V2OiV4fV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtps2uqq256_mask, "V4OiV4fV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtqq2ps128_mask, "V4fV2OiV4fUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2qq128_mask, "V2OiV2dV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2qq256_mask, "V4OiV4dV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2uqq128_mask, "V2OiV2dV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2uqq256_mask, "V4OiV4dV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttps2qq128_mask, "V2OiV4fV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttps2qq256_mask, "V4OiV4fV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttps2uqq128_mask, "V2OiV4fV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttps2uqq256_mask, "V4OiV4fV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtuqq2ps128_mask, "V4fV2OiV4fUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangepd128_mask, "V2dV2dV2dIiV2dUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangepd256_mask, "V4dV4dV4dIiV4dUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangeps128_mask, "V4fV4fV4fIiV4fUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangeps256_mask, "V8fV8fV8fIiV8fUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangesd128_round_mask, "V2dV2dV2dV2dUcIiIi", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangess128_round_mask, "V4fV4fV4fV4fUcIiIi", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reducepd128_mask, "V2dV2dIiV2dUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reducepd256_mask, "V4dV4dIiV4dUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reduceps128_mask, "V4fV4fIiV4fUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reduceps256_mask, "V8fV8fIiV8fUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reducesd_mask, "V2dV2dV2dV2dUcIiIi", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reducess_mask, "V4fV4fV4fV4fUcIiIi", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_pmovswb128_mask, "V16cV8sV16cUc", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovswb256_mask, "V16cV16sV16cUs", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb128_mask, "V16cV8sV16cUc", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb256_mask, "V16cV16sV16cUs", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovwb128_mask, "V16cV8sV16cUc", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2qq512_mask, "V8OiV8dV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2uqq512_mask, "V8OiV8dV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtps2qq512_mask, "V8OiV8fV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtps2uqq512_mask, "V8OiV8fV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtqq2pd512_mask, "V8dV8OiV8dUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtqq2ps512_mask, "V8fV8OiV8fUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2qq512_mask, "V8OiV8dV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2uqq512_mask, "V8OiV8dV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttps2qq512_mask, "V8OiV8fV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttps2uqq512_mask, "V8OiV8fV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtuqq2pd512_mask, "V8dV8OiV8dUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtuqq2ps512_mask, "V8fV8OiV8fUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_rangepd512_mask, "V8dV8dV8dIiV8dUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_rangeps512_mask, "V16fV16fV16fIiV16fUsIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_reducepd512_mask, "V8dV8dIiV8dUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduceps512_mask, "V16fV16fIiV16fUsIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_prold512, "V16iV16iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prolq512, "V8OiV8OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prold128, "V4iV4iIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prold256, "V8iV8iIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolq128, "V2OiV2OiIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolq256, "V4OiV4OiIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolvd512, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prolvq512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prord512, "V16iV16iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prorq512, "V8OiV8OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prolvd128, "V4iV4iV4i", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolvd256, "V8iV8iV8i", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolvq128, "V2OiV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolvq256, "V4OiV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prord128, "V4iV4iIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prord256, "V8iV8iIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorq128, "V2OiV2OiIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorq256, "V4OiV4OiIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorvd512, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prorvq512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prorvd128, "V4iV4iV4i", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorvd256, "V8iV8iV8i", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorvq128, "V2OiV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorvq256, "V4OiV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pshufhw512, "V32sV32sIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pshuflw512, "V32sV32sIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllv32hi, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllw512, "V32sV32sV8s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllwi512, "V32sV32si", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllv16hi, "V16sV16sV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psllv8hi, "V8sV8sV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pslldi512, "V16iV16ii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllqi512, "V8OiV8Oii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlv32hi, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlv16hi, "V16sV16sV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psrlv8hi, "V8sV8sV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psrldi512, "V16iV16ii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlqi512, "V8OiV8Oii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrav32hi, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrav16hi, "V16sV16sV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psrav8hi, "V8sV8sV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psravq128, "V2OiV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psravq256, "V4OiV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psraw512, "V32sV32sV8s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrawi512, "V32sV32si", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlw512, "V32sV32sV8s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlwi512, "V32sV32si", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pslldqi512_byteshift, "V8OiV8OiIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrldqi512_byteshift, "V8OiV8OiIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_movdqa32load128_mask, "V4iV4iC*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa32load256_mask, "V8iV8iC*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa32load512_mask, "V16iV16iC*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_movdqa32store512_mask, "vV16i*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_movdqa64load512_mask, "V8OiV8OiC*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_movdqa64store512_mask, "vV8Oi*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_movdqa32store128_mask, "vV4i*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa32store256_mask, "vV8i*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa64load128_mask, "V2OiV2OiC*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa64load256_mask, "V4OiV4OiC*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa64store128_mask, "vV2Oi*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa64store256_mask, "vV4Oi*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52huq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512ifma,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52luq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512ifma,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52huq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512ifma,avx512vl|avxifma")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52huq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512ifma,avx512vl|avxifma")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52luq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512ifma,avx512vl|avxifma")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52luq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512ifma,avx512vl|avxifma")
-TARGET_BUILTIN(__builtin_ia32_vcomisd, "iV2dV2dIiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcomiss, "iV4fV4fIiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kunpckdi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kunpcksi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_loaddquhi512_mask, "V32sV32sC*V32sUi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_loaddquqi512_mask, "V64cV64cC*V64cUOi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd512_mask, "V8dV8dV8dV8OiIiUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd512_maskz, "V8dV8dV8dV8OiIiUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps512_mask, "V16fV16fV16fV16iIiUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps512_maskz, "V16fV16fV16fV16iIiUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_fixupimmsd_mask, "V2dV2dV2dV2OiIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_fixupimmsd_maskz, "V2dV2dV2dV2OiIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_fixupimmss_mask, "V4fV4fV4fV4iIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_fixupimmss_maskz, "V4fV4fV4fV4iIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_getexpsd128_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_getexpss128_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_getmantsd_round_mask, "V2dV2dV2dIiV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_getmantss_round_mask, "V4fV4fV4fIiV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_loaddquhi128_mask, "V8sV8sC*V8sUc", "nV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddquhi256_mask, "V16sV16sC*V16sUs", "nV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddquqi128_mask, "V16cV16cC*V16cUs", "nV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddquqi256_mask, "V32cV32cC*V32cUi", "nV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd128_mask, "V2dV2dV2dV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd128_maskz, "V2dV2dV2dV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd256_mask, "V4dV4dV4dV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd256_maskz, "V4dV4dV4dV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps128_mask, "V4fV4fV4fV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps128_maskz, "V4fV4fV4fV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps256_mask, "V8fV8fV8fV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps256_maskz, "V8fV8fV8fV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadapd128_mask, "V2dV2dC*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadsd128_mask, "V2dV2dC*V2dUc", "nV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_loadapd256_mask, "V4dV4dC*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadaps128_mask, "V4fV4fC*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadss128_mask, "V4fV4fC*V4fUc", "nV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_loadaps256_mask, "V8fV8fC*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddqudi128_mask, "V2OiV2OiC*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddqudi256_mask, "V4OiV4OiC*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddqusi128_mask, "V4iV4iC*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddqusi256_mask, "V8iV8iC*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadupd128_mask, "V2dV2dC*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadupd256_mask, "V4dV4dC*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadups128_mask, "V4fV4fC*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadups256_mask, "V8fV8fC*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storedquhi512_mask, "vV32s*V32sUi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_storedquqi512_mask, "vV64c*V64cUOi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_storedquhi128_mask, "vV8s*V8sUc", "nV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_storedquhi256_mask, "vV16s*V16sUs", "nV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_storedquqi128_mask, "vV16c*V16cUs", "nV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_storedquqi256_mask, "vV32c*V32cUi", "nV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_storeapd128_mask, "vV2d*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storesd128_mask, "vV2d*V2dUc", "nV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_storeapd256_mask, "vV4d*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storeaps128_mask, "vV4f*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storess128_mask, "vV4f*V4fUc", "nV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_storeaps256_mask, "vV8f*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storedqudi128_mask, "vV2Oi*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storedqudi256_mask, "vV4Oi*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storedqusi128_mask, "vV4i*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storedqusi256_mask, "vV8i*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storeupd128_mask, "vV2d*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storeupd256_mask, "vV4d*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storeups128_mask, "vV4f*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storeups256_mask, "vV8f*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcp14pd128_mask, "V2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcp14pd256_mask, "V4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcp14ps128_mask, "V4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcp14ps256_mask, "V8fV8fV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vplzcntd_128, "V4iV4i", "ncV:128:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vplzcntd_256, "V8iV8i", "ncV:256:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vplzcntq_128, "V2OiV2Oi", "ncV:128:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vplzcntq_256, "V4OiV4Oi", "ncV:256:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtsd2si32, "iV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvtsd2usi32, "UiV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvtss2si32, "iV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvtss2usi32, "UiV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2si32, "iV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2usi32, "UiV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2si32, "iV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2usi32, "UiV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vpermilpd512, "V8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermilps512, "V16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermilvarpd512, "V8dV8dV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermilvarps512, "V16fV16fV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rndscalesd_round_mask, "V2dV2dV2dV2dUcIiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rndscaless_round_mask, "V4fV4fV4fV4fUcIiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_scalefpd512_mask, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scalefps512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scalefsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_scalefss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_psradi512, "V16iV16ii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psraqi512, "V8OiV8Oii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psraq128, "V2OiV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psraq256, "V4OiV4OiV2Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psraqi128, "V2OiV2Oii", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psraqi256, "V4OiV4Oii", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pslld512, "V16iV16iV4i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllq512, "V8OiV8OiV2Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllv16si, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllv8di, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrad512, "V16iV16iV4i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psraq512, "V8OiV8OiV2Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrav16si, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrav8di, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrld512, "V16iV16iV4i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlq512, "V8OiV8OiV2Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlv16si, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlv8di, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pternlogd512_mask, "V16iV16iV16iV16iIiUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pternlogd512_maskz, "V16iV16iV16iV16iIiUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pternlogq512_mask, "V8OiV8OiV8OiV8OiIiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pternlogq512_maskz, "V8OiV8OiV8OiV8OiIiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pternlogd128_mask, "V4iV4iV4iV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogd128_maskz, "V4iV4iV4iV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogd256_mask, "V8iV8iV8iV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogd256_maskz, "V8iV8iV8iV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogq128_mask, "V2OiV2OiV2OiV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogq128_maskz, "V2OiV2OiV2OiV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogq256_mask, "V4OiV4OiV4OiV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogq256_maskz, "V4OiV4OiV4OiV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_shuf_f32x4, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shuf_f64x2, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shuf_i32x4, "V16iV16iV16iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shuf_i64x2, "V8OiV8OiV8OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shufpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shufps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shuf_f32x4_256, "V8fV8fV8fIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_shuf_f64x2_256, "V4dV4dV4dIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_shuf_i32x4_256, "V8iV8iV8iIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_shuf_i64x2_256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_sqrtsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_sqrtss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14pd128_mask, "V2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14pd256_mask, "V4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14ps128_mask, "V4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14ps256_mask, "V8fV8fV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtb2mask512, "UOiV64c", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2b512, "V64cUOi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2w512, "V32sUi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtd2mask512, "UsV16i", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2d512, "V16iUs", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2q512, "V8OiUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtq2mask512, "UcV8Oi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtb2mask128, "UsV16c", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtb2mask256, "UiV32c", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2b128, "V16cUs", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2b256, "V32cUi", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2w128, "V8sUc", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2w256, "V16sUs", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtd2mask128, "UcV4i", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtd2mask256, "UcV8i", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2d128, "V4iUc", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2d256, "V8iUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2q128, "V2OiUc", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2q256, "V4OiUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtq2mask128, "UcV2Oi", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtq2mask256, "UcV4Oi", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb512_mask, "V16cV16iV16cUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb512mem_mask, "vV16c*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovswb512mem_mask, "vV32c*V32sUi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw512_mask, "V16sV16iV16sUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw512mem_mask, "vV16s*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb512_mask, "V16cV8OiV16cUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb512mem_mask, "vV16c*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd512_mask, "V8iV8OiV8iUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd512mem_mask, "vV8i*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw512_mask, "V8sV8OiV8sUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw512mem_mask, "vV8s*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb128_mask, "V16cV4iV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb128mem_mask, "vV16c*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovswb128mem_mask, "vV16c*V8sUc", "nV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb256_mask, "V16cV8iV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb256mem_mask, "vV16c*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovswb256mem_mask, "vV16c*V16sUs", "nV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw128_mask, "V8sV4iV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw128mem_mask, "vV8s*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw256_mask, "V8sV8iV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw256mem_mask, "vV8s*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb128_mask, "V16cV2OiV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb128mem_mask, "vV16c*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb256_mask, "V16cV4OiV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb256mem_mask, "vV16c*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd128_mask, "V4iV2OiV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd128mem_mask, "vV4i*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd256_mask, "V4iV4OiV4iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd256mem_mask, "vV4i*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw128_mask, "V8sV2OiV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw128mem_mask, "vV8s*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw256_mask, "V8sV4OiV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw256mem_mask, "vV8s*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb512_mask, "V16cV16iV16cUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb512mem_mask, "vV16c*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb512mem_mask, "vV32c*V32sUi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw512_mask, "V16sV16iV16sUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw512mem_mask, "vV16s*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb512_mask, "V16cV8OiV16cUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb512mem_mask, "vV16c*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd512_mask, "V8iV8OiV8iUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd512mem_mask, "vV8i*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw512_mask, "V8sV8OiV8sUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw512mem_mask, "vV8s*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb128_mask, "V16cV4iV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb128mem_mask, "vV16c*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb128mem_mask, "vV16c*V8sUc", "nV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb256_mask, "V16cV8iV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb256mem_mask, "vV16c*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb256mem_mask, "vV16c*V16sUs", "nV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw128_mask, "V8sV4iV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw128mem_mask, "vV8s*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw256_mask, "V8sV8iV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw256mem_mask, "vV8s*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb128_mask, "V16cV2OiV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb128mem_mask, "vV16c*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb256_mask, "V16cV4OiV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb256mem_mask, "vV16c*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd128_mask, "V4iV2OiV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd128mem_mask, "vV4i*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd256_mask, "V4iV4OiV4iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd256mem_mask, "vV4i*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw128_mask, "V8sV2OiV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw128mem_mask, "vV8s*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw256_mask, "V8sV4OiV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw256mem_mask, "vV8s*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdb512_mask, "V16cV16iV16cUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovdb512mem_mask, "vV16c*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovwb512mem_mask, "vV32c*V32sUi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovdw512_mask, "V16sV16iV16sUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovdw512mem_mask, "vV16s*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqb512_mask, "V16cV8OiV16cUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqb512mem_mask, "vV16c*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqd512_mask, "V8iV8OiV8iUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqd512mem_mask, "vV8i*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqw512_mask, "V8sV8OiV8sUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqw512mem_mask, "vV8s*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovdb128_mask, "V16cV4iV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovwb128mem_mask, "vV16c*V8sUc", "nV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovdb128mem_mask, "vV16c*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdb256_mask, "V16cV8iV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdb256mem_mask, "vV16c*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovwb256mem_mask, "vV16c*V16sUs", "nV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovdw128_mask, "V8sV4iV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdw128mem_mask, "vV8s*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdw256_mask, "V8sV8iV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdw256mem_mask, "vV8s*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqb128_mask, "V16cV2OiV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqb128mem_mask, "vV16c*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqb256_mask, "V16cV4OiV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqb256mem_mask, "vV16c*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqd128_mask, "V4iV2OiV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqd128mem_mask, "vV4i*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqd256mem_mask, "vV4i*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqw128_mask, "V8sV2OiV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqw128mem_mask, "vV8s*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqw256_mask, "V8sV4OiV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqw256mem_mask, "vV8s*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_extractf32x8_mask, "V8fV16fIiV8fUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_extractf64x2_512_mask, "V2dV8dIiV2dUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_extracti32x8_mask, "V8iV16iIiV8iUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_extracti64x2_512_mask, "V2OiV8OiIiV2OiUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_extracti32x4_mask, "V4iV16iIiV4iUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_extracti64x4_mask, "V4OiV8OiIiV4OiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_extractf64x2_256_mask, "V2dV4dIiV2dUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_extracti64x2_256_mask, "V2OiV4OiIiV2OiUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_extractf32x4_256_mask, "V4fV8fIiV4fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_extracti32x4_256_mask, "V4iV8iIiV4iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_insertf32x8, "V16fV16fV8fIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_insertf64x2_512, "V8dV8dV2dIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_inserti32x8, "V16iV16iV8iIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_inserti64x2_512, "V8OiV8OiV2OiIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_insertf64x4, "V8dV8dV4dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_inserti64x4, "V8OiV8OiV4OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_insertf64x2_256, "V4dV4dV2dIi", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_inserti64x2_256, "V4OiV4OiV2OiIi", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_insertf32x4_256, "V8fV8fV4fIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_inserti32x4_256, "V8iV8iV4iIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_insertf32x4, "V16fV16fV4fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_inserti32x4, "V16iV16iV4iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_getmantpd128_mask, "V2dV2dIiV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantpd256_mask, "V4dV4dIiV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantps128_mask, "V4fV4fIiV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantps256_mask, "V8fV8fIiV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantpd512_mask, "V8dV8dIiV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_getmantps512_mask, "V16fV16fIiV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_getexppd512_mask, "V8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_getexpps512_mask, "V16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddss3_mask,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddss3_maskz, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddss3_mask3, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsd3_mask,  "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsd3_maskz, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsd3_mask3, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmsubsd3_mask3, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmsubss3_mask3, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_permdf512, "V8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permdi512, "V8OiV8OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvarhi512, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvardf512, "V8dV8dV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvardi512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvarsf512, "V16fV16fV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvarsi512, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvarqi512, "V64cV64cV64c", "ncV:512:", "avx512vbmi,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvarqi128, "V16cV16cV16c", "ncV:128:", "avx512vbmi,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_permvarqi256, "V32cV32cV32c", "ncV:256:", "avx512vbmi,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_permvarhi128, "V8sV8sV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_permvarhi256, "V16sV16sV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_permvardf256, "V4dV4dV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_permvardi256, "V4OiV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclasspd128_mask, "UcV2dIiUc", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclasspd256_mask, "UcV4dIiUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclassps128_mask, "UcV4fIiUc", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclassps256_mask, "UcV8fIiUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclassps512_mask, "UsV16fIiUs", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_fpclasspd512_mask, "UcV8dIiUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_fpclasssd_mask, "UcV2dIiUc", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_fpclassss_mask, "UcV4fIiUc", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kaddqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kaddhi, "UsUsUs", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kaddsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kadddi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kandqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kandhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kandsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kanddi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kandnqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kandnhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kandnsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kandndi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_korqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_korhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_korsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kordi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kortestcqi, "iUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kortestzqi, "iUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kortestchi, "iUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kortestzhi, "iUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kortestcsi, "iUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kortestzsi, "iUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kortestcdi, "iUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kortestzdi, "iUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ktestcqi, "iUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_ktestzqi, "iUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_ktestchi, "iUsUs", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_ktestzhi, "iUsUs", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_ktestcsi, "iUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ktestzsi, "iUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ktestcdi, "iUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ktestzdi, "iUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kunpckhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kxnorqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kxnorhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kxnorsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kxnordi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kxorqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kxorhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kxorsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kxordi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kshiftliqi, "UcUcIUi", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kshiftlihi, "UsUsIUi", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kshiftlisi, "UiUiIUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kshiftlidi, "UOiUOiIUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kshiftriqi, "UcUcIUi", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kshiftrihi, "UsUsIUi", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kshiftrisi, "UiUiIUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kshiftridi, "UOiUOiIUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kmovb, "UcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kmovw, "UsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kmovd, "UiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kmovq, "UOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_palignr512, "V64cV64cV64cIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_dbpsadbw128, "V8sV16cV16cIi", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_dbpsadbw256, "V16sV32cV32cIi", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_dbpsadbw512, "V32sV64cV64cIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psadbw512, "V8OiV64cV64c", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressdf512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressdi512_mask, "V8OiV8OiV8OiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compresshi512_mask, "V32sV32sV32sUi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressqi512_mask, "V64cV64cV64cUOi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_compresssf512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compresssi512_mask, "V16iV16iV16iUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpsd_mask, "UcV2dV2dIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cmpss_mask, "UcV4fV4fIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_pshufd512, "V16iV16iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expanddf512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expanddi512_mask, "V8OiV8OiV8OiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandhi512_mask, "V32sV32sV32sUi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandqi512_mask, "V64cV64cV64cUOi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloaddf512_mask, "V8dV8dC*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloaddi512_mask, "V8OiV8OiC*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloadhi512_mask, "V32sV32sC*V32sUi", "nV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloadqi512_mask, "V64cV64cC*V64cUOi", "nV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloadsf512_mask, "V16fV16fC*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloadsi512_mask, "V16iV16iC*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandsf512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandsi512_mask, "V16iV16iV16iUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtps2pd512_mask, "V8dV8fV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstoredf512_mask, "vV8d*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstoredi512_mask, "vV8Oi*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstorehi512_mask, "vV32s*V32sUi", "nV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstoreqi512_mask, "vV64c*V64cUOi", "nV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstoresf512_mask, "vV16f*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstoresi512_mask, "vV16i*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ps_mask, "V4fV8sV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ps256_mask, "V8fV8sV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ph_mask, "V8sV4fIiV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ph256_mask, "V8sV8fIiV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtw2mask512, "UiV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtw2mask128, "UcV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtw2mask256, "UsV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtsd2ss_round_mask, "V4fV4fV2dV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtsi2ss32, "V4fV4fiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtss2sd_round_mask, "V2dV2dV4fV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtusi2ss32, "V4fV4fUiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vpmultishiftqb512, "V64cV64cV64c", "ncV:512:", "avx512vbmi,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpmultishiftqb128, "V16cV16cV16c", "ncV:128:", "avx512vbmi,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpmultishiftqb256, "V32cV32cV32c", "ncV:256:", "avx512vbmi,avx512vl")
-
-// bf16 intrinsics
-TARGET_BUILTIN(__builtin_ia32_cvtne2ps2bf16_128, "V8yV4fV4f", "ncV:128:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtne2ps2bf16_256, "V16yV8fV8f", "ncV:256:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtne2ps2bf16_512, "V32yV16fV16f", "ncV:512:", "avx512bf16,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtneps2bf16_128_mask, "V8yV4fV8yUc", "ncV:128:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtneps2bf16_256_mask, "V8yV8fV8yUc", "ncV:256:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtneps2bf16_512_mask, "V16yV16fV16yUs", "ncV:512:", "avx512bf16,evex512")
-TARGET_BUILTIN(__builtin_ia32_dpbf16ps_128, "V4fV4fV8yV8y", "ncV:128:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_dpbf16ps_256, "V8fV8fV16yV16y", "ncV:256:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_dpbf16ps_512, "V16fV16fV32yV32y", "ncV:512:", "avx512bf16,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtsbf162ss_32, "fy", "nc", "avx512bf16")
-
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_q_512, "vV8OiV8OiUc*Uc*", "nV:512:", "avx512vp2intersect,evex512")
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_q_256, "vV4OiV4OiUc*Uc*", "nV:256:", "avx512vp2intersect,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_q_128, "vV2OiV2OiUc*Uc*", "nV:128:", "avx512vp2intersect,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_512, "vV16iV16iUs*Us*", "nV:512:", "avx512vp2intersect,evex512")
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_256, "vV8iV8iUc*Uc*", "nV:256:", "avx512vp2intersect,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_128, "vV4iV4iUc*Uc*", "nV:128:", "avx512vp2intersect,avx512vl")
-
-// AVX512 fp16 intrinsics
-TARGET_BUILTIN(__builtin_ia32_vcomish,       "iV8xV8xIiIi",    "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_addph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_subph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_mulph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_divph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_maxph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_minph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_minph256,      "V16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_minph128,      "V8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_maxph256,      "V16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_maxph128,      "V8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_addsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_divsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_mulsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_subsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_maxsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_minsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_cmpph512_mask, "UiV32xV32xIiUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpph256_mask, "UsV16xV16xIiUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpph128_mask, "UcV8xV8xIiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpsh_mask, "UcV8xV8xIiUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_loadsh128_mask, "V8xV8xC*V8xUc", "nV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_storesh128_mask, "vV8x*V8xUc", "nV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_rcpph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcpph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcpph512_mask, "V32xV32xV32xUi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_rsqrtph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rsqrtph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rsqrtph512_mask, "V32xV32xV32xUi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_getmantph128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantph256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantph512_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_getexpph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexpph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexpph512_mask, "V32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_scalefph128_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefph256_mask, "V16xV16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefph512_mask, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_rndscaleph_128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscaleph_256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscaleph_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduceph128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduceph256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduceph512_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_rcpsh_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_rsqrtsh_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_getmantsh_round_mask, "V8xV8xV8xIiV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_getexpsh128_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_scalefsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_rndscalesh_round_mask, "V8xV8xV8xV8xUcIiIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_reducesh_mask, "V8xV8xV8xV8xUcIiIi", "ncV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_sqrtph, "V8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_sqrtph256, "V16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_sqrtph512, "V32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_sqrtsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_fpclassph128_mask, "UcV8xIiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclassph256_mask, "UsV16xIiUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclassph512_mask, "UiV32xIiUi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_fpclasssh_mask, "UcV8xIiUc", "ncV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph128_mask, "V8xV2dV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph256_mask, "V8xV4dV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph512_mask, "V8xV8dV8xUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2pd128_mask, "V2dV8xV2dUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2pd256_mask, "V4dV8xV4dUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2pd512_mask, "V8dV8xV8dUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtsh2ss_round_mask, "V4fV4fV8xV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtss2sh_round_mask, "V8xV8xV4fV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtsd2sh_round_mask, "V8xV8xV2dV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtsh2sd_round_mask, "V2dV2dV8xV2dUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2w128_mask, "V8sV8xV8sUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2w256_mask, "V16sV16xV16sUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2w512_mask, "V32sV32xV32sUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2w128_mask, "V8sV8xV8sUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2w256_mask, "V16sV16xV16sUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2w512_mask, "V32sV32xV32sUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtw2ph128_mask, "V8xV8sV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtw2ph256_mask, "V16xV16sV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtw2ph512_mask, "V32xV32sV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uw128_mask, "V8UsV8xV8UsUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uw256_mask, "V16UsV16xV16UsUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uw512_mask, "V32UsV32xV32UsUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uw128_mask, "V8UsV8xV8UsUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uw256_mask, "V16UsV16xV16UsUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uw512_mask, "V32UsV32xV32UsUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph128_mask, "V8xV8UsV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph256_mask, "V16xV16UsV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph512_mask, "V32xV32UsV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2dq128_mask, "V4iV8xV4iUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2dq256_mask, "V8iV8xV8iUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2dq512_mask, "V16iV16xV16iUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2udq128_mask, "V4UiV8xV4UiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2udq256_mask, "V8UiV8xV8UiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2udq512_mask, "V16UiV16xV16UiUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph128_mask, "V8xV4iV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph256_mask, "V8xV8iV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph512_mask, "V16xV16iV16xUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph128_mask, "V8xV4UiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph256_mask, "V8xV8UiV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph512_mask, "V16xV16UiV16xUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2dq128_mask, "V4iV8xV4iUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2dq256_mask, "V8iV8xV8iUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2dq512_mask, "V16iV16xV16iUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2udq128_mask, "V4UiV8xV4UiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2udq256_mask, "V8UiV8xV8UiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2udq512_mask, "V16UiV16xV16UiUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph128_mask, "V8xV2OiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph256_mask, "V8xV4OiV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph512_mask, "V8xV8OiV8xUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2qq128_mask, "V2OiV8xV2OiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2qq256_mask, "V4OiV8xV4OiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2qq512_mask, "V8OiV8xV8OiUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph128_mask, "V8xV2UOiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph256_mask, "V8xV4UOiV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph512_mask, "V8xV8UOiV8xUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq128_mask, "V2UOiV8xV2UOiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq256_mask, "V4UOiV8xV4UOiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq512_mask, "V8UOiV8xV8UOiUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2qq128_mask, "V2OiV8xV2OiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2qq256_mask, "V4OiV8xV4OiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2qq512_mask, "V8OiV8xV8OiUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq128_mask, "V2UOiV8xV2UOiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq256_mask, "V4UOiV8xV4UOiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq512_mask, "V8UOiV8xV8UOiUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtsh2si32, "iV8xIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtsh2usi32, "UiV8xIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtusi2sh, "V8xV8xUiIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtsi2sh, "V8xV8xiIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvttsh2si32, "iV8xIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvttsh2usi32, "UiV8xIi", "ncV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_vcvtph2psx128_mask, "V4fV8xV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2psx256_mask, "V8fV8xV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2psx512_mask, "V16fV16xV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2phx128_mask, "V8xV4fV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2phx256_mask, "V8xV8fV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2phx512_mask, "V16xV16fV16xUsIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vfmaddph, "V8xV8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph256, "V16xV16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph512_mask,  "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph512_maskz, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph, "V8xV8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256, "V16xV16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph512_mask, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph512_maskz, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmsubph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_maskz, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_mask3, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmsubsh3_mask3, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph128_mask,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph128_maskz,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_mask,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_maskz,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_mask,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_maskz,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_mask3,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph128_mask,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph128_maskz,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_mask,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_maskz,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_mask,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_maskz,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_mask3,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_mask,   "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_maskz,   "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_mask,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_maskz,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_round_mask,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_round_mask3,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_round_mask,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_round_mask3,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_vfmulcsh_mask,   "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfcmulcsh_mask,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmulcph128_mask,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmulcph256_mask,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmulcph512_mask,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfcmulcph128_mask,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmulcph256_mask,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmulcph512_mask,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-
-// generic select intrinsics
-TARGET_BUILTIN(__builtin_ia32_selectb_128, "V16cUsV16cV16c", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectb_256, "V32cUiV32cV32c", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectb_512, "V64cUOiV64cV64c", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectw_128, "V8sUcV8sV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectw_256, "V16sUsV16sV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectw_512, "V32sUiV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectd_128, "V4iUcV4iV4i", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectd_256, "V8iUcV8iV8i", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectd_512, "V16iUsV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectph_128, "V8xUcV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectph_256, "V16xUsV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectph_512, "V32xUiV32xV32x", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectpbf_128, "V8yUcV8yV8y", "ncV:128:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectpbf_256, "V16yUsV16yV16y", "ncV:256:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectpbf_512, "V32yUiV32yV32y", "ncV:512:", "avx512bf16,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectq_128, "V2OiUcV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectq_256, "V4OiUcV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectq_512, "V8OiUcV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectps_128, "V4fUcV4fV4f", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectps_256, "V8fUcV8fV8f", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectps_512, "V16fUsV16fV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectpd_128, "V2dUcV2dV2d", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectpd_256, "V4dUcV4dV4d", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectpd_512, "V8dUcV8dV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectsh_128, "V8xUcV8xV8x", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_selectsbf_128, "V8yUcV8yV8y", "ncV:128:", "avx512bf16")
-TARGET_BUILTIN(__builtin_ia32_selectss_128, "V4fUcV4fV4f", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_selectsd_128, "V2dUcV2dV2d", "ncV:128:", "avx512f")
-
-// generic reduction intrinsics
-TARGET_BUILTIN(__builtin_ia32_reduce_fadd_pd512, "ddV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ps512, "ffV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph512, "xxV32x", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph256, "xxV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph128, "xxV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmax_pd512, "dV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ps512, "fV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph512, "xV32x", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph256, "xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph128, "xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmin_pd512, "dV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ps512, "fV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph512, "xV32x", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph256, "xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph128, "xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmul_pd512, "ddV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ps512, "ffV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph512, "xxV32x", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph256, "xxV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph128, "xxV8x", "ncV:128:", "avx512fp16,avx512vl")
-
-// MONITORX/MWAITX
-TARGET_BUILTIN(__builtin_ia32_monitorx, "vvC*UiUi", "n", "mwaitx")
-TARGET_BUILTIN(__builtin_ia32_mwaitx, "vUiUiUi", "n", "mwaitx")
-
-// WAITPKG
-TARGET_BUILTIN(__builtin_ia32_umonitor, "vvC*", "n", "waitpkg")
-TARGET_BUILTIN(__builtin_ia32_umwait, "UcUiUiUi", "n", "waitpkg")
-TARGET_BUILTIN(__builtin_ia32_tpause, "UcUiUiUi", "n", "waitpkg")
-
-// CLZERO
-TARGET_BUILTIN(__builtin_ia32_clzero, "vv*", "n", "clzero")
-
-// CLDEMOTE
-TARGET_BUILTIN(__builtin_ia32_cldemote, "vvC*", "n", "cldemote")
-
-// Direct Move
-TARGET_BUILTIN(__builtin_ia32_directstore_u32, "vUi*Ui", "n", "movdiri")
-TARGET_BUILTIN(__builtin_ia32_movdir64b, "vv*vC*", "n", "movdir64b")
-
-// PTWRITE
-TARGET_BUILTIN(__builtin_ia32_ptwrite32, "vUi", "n", "ptwrite")
-
-// INVPCID
-TARGET_BUILTIN(__builtin_ia32_invpcid, "vUiv*", "nc", "invpcid")
-
-// ENQCMD
-TARGET_BUILTIN(__builtin_ia32_enqcmd, "Ucv*vC*", "n", "enqcmd")
-TARGET_BUILTIN(__builtin_ia32_enqcmds, "Ucv*vC*", "n", "enqcmd")
-
-// KEY LOCKER
-TARGET_BUILTIN(__builtin_ia32_loadiwkey, "vV2OiV2OiV2OiUi", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_encodekey128_u32, "UiUiV2Oiv*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_encodekey256_u32, "UiUiV2OiV2Oiv*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesenc128kl_u8, "UcV2Oi*V2OivC*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesenc256kl_u8, "UcV2Oi*V2OivC*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesdec128kl_u8, "UcV2Oi*V2OivC*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesdec256kl_u8, "UcV2Oi*V2OivC*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesencwide128kl_u8, "UcV2Oi*V2OiC*vC*", "nV:128:", "kl,widekl")
-TARGET_BUILTIN(__builtin_ia32_aesencwide256kl_u8, "UcV2Oi*V2OiC*vC*", "nV:128:", "kl,widekl")
-TARGET_BUILTIN(__builtin_ia32_aesdecwide128kl_u8, "UcV2Oi*V2OiC*vC*", "nV:128:", "kl,widekl")
-TARGET_BUILTIN(__builtin_ia32_aesdecwide256kl_u8, "UcV2Oi*V2OiC*vC*", "nV:128:", "kl,widekl")
-
-// SERIALIZE
-TARGET_BUILTIN(__builtin_ia32_serialize, "v", "n", "serialize")
-
-// TSXLDTRK
-TARGET_BUILTIN(__builtin_ia32_xsusldtrk, "v", "n", "tsxldtrk")
-TARGET_BUILTIN(__builtin_ia32_xresldtrk, "v", "n", "tsxldtrk")
-
-// RAO-INT
-TARGET_BUILTIN(__builtin_ia32_aadd32, "vv*Si", "n", "raoint")
-TARGET_BUILTIN(__builtin_ia32_aand32, "vv*Si", "n", "raoint")
-TARGET_BUILTIN(__builtin_ia32_aor32, "vv*Si", "n", "raoint")
-TARGET_BUILTIN(__builtin_ia32_axor32, "vv*Si", "n", "raoint")
-
-// MSVC
-TARGET_HEADER_BUILTIN(_BitScanForward, "UcUNi*UNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_BitScanReverse, "UcUNi*UNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(_ReadWriteBarrier, "v", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_ReadBarrier,      "v", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_WriteBarrier,     "v", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__cpuid,   "vi*i",  "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__cpuidex, "vi*ii", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__emul,  "LLiii",    "nch", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__emulu, "ULLiUiUi", "nch", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(_AddressOfReturnAddress, "v*", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__stosb, "vUc*Ucz", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__int2c, "v",       "nhr", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__ud2,   "v",       "nhr", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__readfsbyte,  "UcUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readfsword,  "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readfsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readfsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__readgsbyte,  "UcUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readgsword,  "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-// AVX10.2 VNNI FP16
-TARGET_BUILTIN(__builtin_ia32_vdpphps128, "V4fV4fV8xV8x", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdpphps256, "V8fV8fV16xV16x", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdpphps512, "V16fV16fV32xV32x", "ncV:512:", "avx10.2-512")
-
-// AVX10.2 VNNI INT8
-TARGET_BUILTIN(__builtin_ia32_vpdpbssd512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsud512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsuds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuud512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuuds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-
-// AVX10.2 VNNI INT16
-TARGET_BUILTIN(__builtin_ia32_vpdpwsud512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsuds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusd512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuud512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuuds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-
-// AVX10.2 VMPSADBW
-TARGET_BUILTIN(__builtin_ia32_mpsadbw512, "V32sV64cV64cIc", "ncV:512:", "avx10.2-512")
-
-// AVX10.2 YMM Rounding
-TARGET_BUILTIN(__builtin_ia32_vaddpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vaddph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vaddps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcmppd256_round_mask, "UcV4dV4dIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcmpph256_round_mask, "UsV16xV16xIiUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcmpps256_round_mask, "UcV8fV8fIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph256_round_mask, "V8xV8iV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtdq2ps256_round_mask, "V8fV8iV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2dq256_round_mask, "V4iV4dV4iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph256_round_mask, "V8xV4dV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2ps256_round_mask, "V4fV4dV4fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2qq256_round_mask, "V4LLiV4dV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2udq256_round_mask, "V4UiV4dV4UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2uqq256_round_mask, "V4ULLiV4dV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2dq256_round_mask, "V8iV8xV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2pd256_round_mask, "V4dV8xV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2psx256_round_mask, "V8fV8xV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2qq256_round_mask, "V4LLiV8xV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2udq256_round_mask, "V8UiV8xV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq256_round_mask, "V4ULLiV8xV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uw256_round_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2w256_round_mask, "V16sV16xV16sUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2dq256_round_mask, "V8iV8fV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2pd256_round_mask, "V4dV4fV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2phx256_round_mask, "V8xV8fV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2qq256_round_mask, "V4LLiV4fV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2udq256_round_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2uqq256_round_mask, "V4ULLiV4fV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2pd256_round_mask, "V4dV4LLiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph256_round_mask, "V8xV4LLiV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2ps256_round_mask, "V4fV4LLiV4fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2dq256_round_mask, "V4iV4dV4iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2qq256_round_mask, "V4LLiV4dV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2udq256_round_mask, "V4UiV4dV4UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2uqq256_round_mask, "V4ULLiV4dV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2dq256_round_mask, "V8iV8xV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2qq256_round_mask, "V4LLiV8xV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2udq256_round_mask, "V8UiV8xV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq256_round_mask, "V4ULLiV8xV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uw256_round_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2w256_round_mask, "V16sV16xV16sUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2dq256_round_mask, "V8iV8fV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2qq256_round_mask, "V4LLiV4fV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2udq256_round_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2uqq256_round_mask, "V4ULLiV4fV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph256_round_mask, "V8xV8UiV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtudq2ps256_round_mask, "V8fV8UiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2pd256_round_mask, "V4dV4ULLiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph256_round_mask, "V8xV4ULLiV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ps256_round_mask, "V4fV4ULLiV4fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph256_round_mask, "V16xV16UsV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtw2ph256_round_mask, "V16xV16sV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdivpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdivph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdivps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_round_maskz, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfcmulcph256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfixupimmpd256_round_mask, "V4dV4dV4dV4LLiIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfixupimmpd256_round_maskz, "V4dV4dV4dV4LLiIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfixupimmps256_round_mask, "V8fV8fV8fV8iIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfixupimmps256_round_maskz, "V8fV8fV8fV8iIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_round_mask, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_round_maskz, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_round_mask3, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph256_round_mask, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph256_round_maskz, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph256_round_mask3, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps256_round_maskz, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_round_maskz, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_round_mask, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_round_maskz, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_round_mask3, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256_round_mask, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256_round_maskz, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256_round_mask3, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_round_maskz, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubpd256_round_mask3, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubph256_round_mask3, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubps256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd256_round_mask3, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddph256_round_mask3, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddps256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmulcph256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetexppd256_round_mask, "V4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetexpph256_round_mask, "V16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetexpps256_round_mask, "V8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetmantpd256_round_mask, "V4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetmantph256_round_mask, "V16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetmantps256_round_mask, "V8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmaxpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmaxph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmaxps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmulpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmulph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmulps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrangepd256_round_mask, "V4dV4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrangeps256_round_mask, "V8fV8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vreducepd256_round_mask, "V4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vreduceph256_round_mask, "V16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vreduceps256_round_mask, "V8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrndscalepd256_round_mask, "V4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrndscaleph256_round_mask, "V16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrndscaleps256_round_mask, "V8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vscalefpd256_round_mask, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vscalefph256_round_mask, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vscalefps256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsqrtpd256_round, "V4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsqrtph256_round, "V16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsqrtps256_round, "V8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsubpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsubph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsubps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-
-// AVX-VNNI-INT16
-TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusd128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusd256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-
-// AVX10.2 SATCVT-DS
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2sis32, "iV2dIi", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2usis32, "UiV2dIi", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2sis32, "iV4fIi", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2usis32, "UiV4fIi", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2dqs128_mask, "V4iV2dV4iUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2dqs256_round_mask, "V4iV4dV4iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2dqs512_round_mask, "V8iV8dV8iUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2udqs128_mask, "V4iV2dV4iUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2udqs256_round_mask, "V4iV4dV4iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2udqs512_round_mask, "V8iV8dV8iUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2qqs128_mask,  "V2OiV2dV2OiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2qqs256_round_mask, "V4OiV4dV4OiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2qqs512_round_mask, "V8OiV8dV8OiUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2uqqs128_mask, "V2OiV2dV2OiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2uqqs256_round_mask, "V4OiV4dV4OiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2uqqs512_round_mask, "V8OiV8dV8OiUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2dqs128_mask, "V4iV4fV4iUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2dqs256_round_mask, "V8iV8fV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2dqs512_round_mask, "V16iV16fV16iUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2udqs128_mask, "V4iV4fV4iUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2udqs256_round_mask, "V8iV8fV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2udqs512_round_mask, "V16iV16fV16iUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2qqs128_mask, "V2OiV4fV2OiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2qqs256_round_mask, "V4OiV4fV4OiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2qqs512_round_mask, "V8OiV8fV8OiUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2uqqs128_mask, "V2OiV4fV2OiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2uqqs256_round_mask, "V4OiV4fV4OiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2uqqs512_round_mask, "V8OiV8fV8OiUcIi", "nV:512:", "avx10.2-512")
-
-// AVX-NE-CONVERT
-TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps128, "V4fyC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps256, "V8fyC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vbcstnesh2ps128, "V4fxC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vbcstnesh2ps256, "V8fxC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneebf162ps128, "V4fV8yC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneebf162ps256, "V8fV16yC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneeph2ps128, "V4fV8xC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneeph2ps256, "V8fV16xC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneobf162ps128, "V4fV8yC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneobf162ps256, "V8fV16yC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneoph2ps128, "V4fV8xC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneoph2ps256, "V8fV16xC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneps2bf16128, "V8yV4f", "nV:128:", "avx512bf16,avx512vl|avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneps2bf16256, "V8yV8f", "nV:256:", "avx512bf16,avx512vl|avxneconvert")
-
-// SHA512
-TARGET_BUILTIN(__builtin_ia32_vsha512msg1, "V4ULLiV4ULLiV2ULLi", "nV:256:", "sha512")
-TARGET_BUILTIN(__builtin_ia32_vsha512msg2, "V4ULLiV4ULLiV4ULLi", "nV:256:", "sha512")
-TARGET_BUILTIN(__builtin_ia32_vsha512rnds2, "V4ULLiV4ULLiV4ULLiV2ULLi", "nV:256:", "sha512")
-
-TARGET_HEADER_BUILTIN(_InterlockedAnd64,         "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedDecrement64,   "WiWiD*",   "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedExchange64,    "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedExchangeAdd64, "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedExchangeSub64, "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedIncrement64,   "WiWiD*",   "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedOr64,          "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedXor64,         "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-// SM3
-TARGET_BUILTIN(__builtin_ia32_vsm3msg1, "V4UiV4UiV4UiV4Ui", "nV:128:", "sm3")
-TARGET_BUILTIN(__builtin_ia32_vsm3msg2, "V4UiV4UiV4UiV4Ui", "nV:128:", "sm3")
-TARGET_BUILTIN(__builtin_ia32_vsm3rnds2, "V4UiV4UiV4UiV4UiIUi", "nV:128:", "sm3")
-
-// SM4
-TARGET_BUILTIN(__builtin_ia32_vsm4key4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
-TARGET_BUILTIN(__builtin_ia32_vsm4key4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
-TARGET_BUILTIN(__builtin_ia32_vsm4rnds4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
-TARGET_BUILTIN(__builtin_ia32_vsm4rnds4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
-
-// SM4_EVEX
-TARGET_BUILTIN(__builtin_ia32_vsm4key4512, "V16UiV16UiV16Ui", "nV:512:", "avx10.2-512,sm4")
-TARGET_BUILTIN(__builtin_ia32_vsm4rnds4512, "V16UiV16UiV16Ui", "nV:512:", "avx10.2-512,sm4")
-
-// AVX10 MINMAX
-TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16128, "V8yV8yV8yIi", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16256, "V16yV16yV16yIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16512, "V32yV32yV32yIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vminmaxpd128_mask, "V2dV2dV2dIiV2dUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxpd256_round_mask, "V4dV4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxpd512_round_mask, "V8dV8dV8dIiV8dUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vminmaxph128_mask, "V8xV8xV8xIiV8xUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxph256_round_mask, "V16xV16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxph512_round_mask, "V32xV32xV32xIiV32xUiIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vminmaxps128_mask, "V4fV4fV4fIiV4fUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxps256_round_mask, "V8fV8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxps512_round_mask, "V16fV16fV16fIiV16fUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vminmaxsd_round_mask, "V2dV2dV2dIiV2dUcIi", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxsh_round_mask, "V8xV8xV8xIiV8xUcIi", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxss_round_mask, "V4fV4fV4fIiV4fUcIi", "nV:128:", "avx10.2-256")
-
-// AVX10.2 SATCVT
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162ibs128, "V8UsV8y", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162ibs256, "V16UsV16y", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162ibs512, "V32UsV32y", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162iubs128, "V8UsV8y", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162iubs256, "V16UsV16y", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162iubs512, "V32UsV32y", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ibs128_mask, "V8UsV8xV8UsUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ibs256_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ibs512_mask, "V32UsV32xV32UsUiIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2iubs128_mask, "V8UsV8xV8UsUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2iubs256_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2iubs512_mask, "V32UsV32xV32UsUiIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ibs128_mask, "V4UiV4fV4UiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ibs256_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ibs512_mask, "V16UiV16fV16UiUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2iubs128_mask, "V4UiV4fV4UiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2iubs256_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2iubs512_mask, "V16UiV16fV16UiUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162ibs128, "V8UsV8y", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162ibs256, "V16UsV16y", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162ibs512, "V32UsV32y", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162iubs128, "V8UsV8y", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162iubs256, "V16UsV16y", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162iubs512, "V32UsV32y", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2ibs128_mask, "V8UsV8xV8UsUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2ibs256_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2ibs512_mask, "V32UsV32xV32UsUiIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2iubs128_mask, "V8UsV8xV8UsUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2iubs256_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2iubs512_mask, "V32UsV32xV32UsUiIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2ibs128_mask, "V4UiV4fV4UiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2ibs256_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2ibs512_mask, "V16UiV16fV16UiUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2iubs128_mask, "V4UiV4fV4UiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2iubs256_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2iubs512_mask, "V16UiV16fV16UiUsIi", "nV:512:", "avx10.2-512")
-
-// AVX10.2 CONVERT
-TARGET_BUILTIN(__builtin_ia32_vcvt2ps2phx128_mask, "V8xV4fV4fV8xUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvt2ps2phx256_mask, "V16xV8fV8fV16xUsIi", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvt2ps2phx512_mask, "V32xV16fV16fV32xUiIi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8s_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8s_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8s_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8s_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8s_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8s_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8_128, "V16cV8xV8x", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8_256, "V32cV16xV16x", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8_512, "V64cV32xV32x", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8s_128, "V16cV8xV8x", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8s_256, "V32cV16xV16x", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8s_512, "V64cV32xV32x", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8_128, "V16cV8xV8x", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8_256, "V32cV16xV16x", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8_512, "V64cV32xV32x", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8s_128, "V16cV8xV8x", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8s_256, "V32cV16xV16x", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8s_512, "V64cV32xV32x", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvthf8_2ph128_mask, "V8xV16cV8xUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvthf8_2ph256_mask, "V16xV16cV16xUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvthf8_2ph512_mask, "V32xV32cV32xUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8s_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8s_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8s_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
-
-// AVX10.2 BF16
-TARGET_BUILTIN(__builtin_ia32_loadsbf16128_mask, "V8yV8yC*V8yUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_storesbf16128_mask, "vV8y*V8yUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vaddnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vaddnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vaddnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vdivnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdivnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdivnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vmaxpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmaxpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmaxpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vminpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vmulnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmulnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmulnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vsubnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsubnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsubnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16eq, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16lt, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16neq, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16ge, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16gt, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16le, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcmppbf16512_mask,"UiV32yV32yIiUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcmppbf16256_mask,"UsV16yV16yIiUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcmppbf16128_mask,"UcV8yV8yIiUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16128_mask, "UcV8yIiUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16256_mask, "UsV16yIiUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16512_mask, "UiV32yIiUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vscalefpbf16128_mask, "V8yV8yV8yV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vscalefpbf16256_mask, "V16yV16yV16yV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vscalefpbf16512_mask, "V32yV32yV32yV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vrcppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrcppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrcppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vgetexppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetexppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetexppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vreducenepbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vreducenepbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vreducenepbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16, "V8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16256, "V16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16512, "V32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh512, "V32yV32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh256, "V16yV16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh128, "V8yV8yV8yV8y", "ncV:128:", "avx10.2-256")
-
-#undef BUILTIN
-#undef TARGET_BUILTIN
-#undef TARGET_HEADER_BUILTIN
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index cf8d2771310e3..73678bc868bfd 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -15,8 +15,26 @@ include "clang/Basic/BuiltinsBase.td"
 class X86Builtin<string prototype> : TargetBuiltin {
   let Spellings = ["__builtin_ia32_" # NAME];
   let Prototype = prototype;
+  let EnableOpenCLLong = 1;
 }
 
+class X86NoPrefixBuiltin<string prototype> : TargetBuiltin {
+  let Spellings = [NAME];
+  let Prototype = prototype;
+}
+
+class X86LibBuiltin<string prototype> : TargetLibBuiltin {
+  let Spellings = [NAME];
+  let Prototype = prototype;
+}
+
+def rdpmc : X86Builtin<"unsigned long long int(int)">;
+def rdtsc : X86Builtin<"unsigned long long int()">;
+def __rdtsc : X86NoPrefixBuiltin<"unsigned long long int()"> {
+  let EnableOpenCLLong = 1;
+}
+def rdtscp : X86Builtin<"unsigned long long int(unsigned int*)">;
+
 // Undefined Values
 def undef128 : X86Builtin<"_Vector<2, double>()"> {
   let Attributes = [Const, NoThrow, RequiredVectorWidth<128>];
@@ -135,3 +153,5375 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<256>], Features = "avx" in
     def Op#ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>)">;
   }
 }
+
+
+// Mechanically ported builtins from the original `.def` file.
+//
+// TODO: Build structured ways of synthesizing relevant groups and improve the
+// organization of the builtins below this point (and move them above it). The
+// current formulation is based on what was easiest to recognize from the
+// pre-TableGen version.
+
+let Features = "mmx", Attributes = [NoThrow, Const] in {
+  def _mm_prefetch : X86NoPrefixBuiltin<"void(char const *, int)">;
+}
+
+let Features = "sse", Attributes = [NoThrow] in {
+  def ldmxcsr : X86Builtin<"void(unsigned int)">;
+}
+
+let Features = "sse", Header = "xmmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_setcsr : X86LibBuiltin<"void(unsigned int)">;
+}
+
+let Features = "sse", Attributes = [NoThrow] in {
+  def stmxcsr : X86Builtin<"unsigned int()">;
+}
+
+let Features = "sse", Header = "xmmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_getcsr : X86LibBuiltin<"unsigned int()">;
+}
+
+let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtss2si : X86Builtin<"int(_Vector<4, float>)">;
+  def cvttss2si : X86Builtin<"int(_Vector<4, float>)">;
+}
+
+let Features = "sse", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def movmskps : X86Builtin<"int(_Vector<4, float>)">;
+}
+
+let Features = "sse", Attributes = [NoThrow] in {
+  def sfence : X86Builtin<"void()">;
+}
+
+let Features = "sse", Header = "xmmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_sfence : X86LibBuiltin<"void()">;
+}
+
+let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcpps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def rcpss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def rsqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def rsqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def sqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def sqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def shufps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def maskmovdqu : X86Builtin<"void(_Vector<16, char>, _Vector<16, char>, char *)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def movmskpd : X86Builtin<"int(_Vector<2, double>)">;
+  def pmovmskb128 : X86Builtin<"int(_Vector<16, char>)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow] in {
+  def movnti : X86Builtin<"void(int *, int)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pshufd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">;
+  def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
+  def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
+  def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
+  def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
+  def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
+  def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
+  def cvtpd2dq : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>)">;
+  def cvtpd2ps : X86Builtin<"_Vector<4, float>(_Vector<2, double>)">;
+  def cvttpd2dq : X86Builtin<"_Vector<4, int>(_Vector<2, double>)">;
+  def cvtsd2si : X86Builtin<"int(_Vector<2, double>)">;
+  def cvttsd2si : X86Builtin<"int(_Vector<2, double>)">;
+  def cvtsd2ss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<2, double>)">;
+  def cvtps2dq : X86Builtin<"_Vector<4, int>(_Vector<4, float>)">;
+  def cvttps2dq : X86Builtin<"_Vector<4, int>(_Vector<4, float>)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow] in {
+  def clflush : X86Builtin<"void(void const *)">;
+}
+
+let Features = "sse2", Header = "emmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_clflush : X86LibBuiltin<"void(void const *)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow] in {
+  def lfence : X86Builtin<"void()">;
+}
+
+let Features = "sse2", Header = "emmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_lfence : X86LibBuiltin<"void()">;
+}
+
+let Features = "sse2", Attributes = [NoThrow] in {
+  def mfence : X86Builtin<"void()">;
+}
+
+let Features = "sse2", Header = "emmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_mfence : X86LibBuiltin<"void()">;
+}
+
+let Attributes = [NoThrow] in {
+  def pause : X86Builtin<"void()">;
+}
+
+let Header = "emmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_pause : X86LibBuiltin<"void()">;
+}
+
+let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmuludq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
+  def psraw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def psrad128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def psrlw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def psrld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def psrlq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def psllw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def psllwi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
+  def pslldi128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int)">;
+  def psllqi128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, int)">;
+  def psrlwi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
+  def psrldi128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int)">;
+  def psrlqi128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, int)">;
+  def psrawi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
+  def psradi128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int)">;
+  def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">;
+  def pslldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
+  def psrldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "sse3", Attributes = [NoThrow] in {
+  def monitor : X86Builtin<"void(void const *, unsigned int, unsigned int)">;
+  def mwait : X86Builtin<"void(unsigned int, unsigned int)">;
+}
+
+let Features = "sse3", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def lddqu : X86Builtin<"_Vector<16, char>(char const *)">;
+}
+
+let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def palignr128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant int)">;
+}
+
+let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">;
+  def pblendvb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
+  def pblendw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">;
+  def blendpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
+  def blendps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
+  def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+  def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
+  def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
+  def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
+  def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
+  def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
+  def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
+  def roundpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int)">;
+  def dpps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">;
+  def dppd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant char)">;
+  def ptestz128 : X86Builtin<"int(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def ptestc128 : X86Builtin<"int(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def ptestnzc128 : X86Builtin<"int(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def mpsadbw128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def phminposuw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>)">;
+  def vec_ext_v16qi : X86Builtin<"char(_Vector<16, char>, _Constant int)">;
+  def vec_set_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, char, _Constant int)">;
+  def vec_set_v4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int, _Constant int)">;
+}
+
+let Features = "sse4.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // "sse4.2" builtins over 16 x char vectors; each ends with a _Constant char operand, and the pcmpestr* forms take an extra int after each vector.
+  def pcmpistrm128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpistri128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpestrm128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpestri128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpistria128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpistric128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpistrio128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpistris128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpistriz128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpestria128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpestric128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpestrio128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpestris128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpestriz128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+}
+
+let Features = "crc32", Attributes = [NoThrow, Const] in { // "crc32" builtins: (unsigned int, unsigned char/short/int) -> unsigned int; scalar only, so no RequiredVectorWidth.
+  def crc32qi : X86Builtin<"unsigned int(unsigned int, unsigned char)">;
+  def crc32hi : X86Builtin<"unsigned int(unsigned int, unsigned short)">;
+  def crc32si : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+}
+
+let Features = "sse4a", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // "sse4a" builtins returning 2 x long long; the *i variants take two _Constant char operands.
+  def extrqi : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant char, _Constant char)">;
+  def extrq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<16, char>)">;
+  def insertqi : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char, _Constant char)">;
+  def insertq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "sse4a", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // "sse4a" builtins returning void and taking a non-const pointer plus a 128-bit vector; declared without Const.
+  def movntsd : X86Builtin<"void(double *, _Vector<2, double>)">;
+  def movntss : X86Builtin<"void(float *, _Vector<4, float>)">;
+}
+
+let Features = "aes", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // "aes" builtins on 2 x long long (128-bit) vectors; aeskeygenassist128 additionally takes a _Constant char.
+  def aesenc128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def aesenclast128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def aesdec128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def aesdeclast128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def aesimc128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">;
+  def aeskeygenassist128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant char)">;
+}
+
+let Features = "vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit aesenc form (4 x long long), gated on "vaes".
+  def aesenc256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512,vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit aesenc form (8 x long long), gated on "avx512f,evex512,vaes".
+  def aesenc512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit aesenclast form (4 x long long), gated on "vaes".
+  def aesenclast256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512,vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit aesenclast form (8 x long long), gated on "avx512f,evex512,vaes".
+  def aesenclast512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit aesdec form (4 x long long), gated on "vaes".
+  def aesdec256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512,vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit aesdec form (8 x long long), gated on "avx512f,evex512,vaes".
+  def aesdec512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit aesdeclast form (4 x long long), gated on "vaes".
+  def aesdeclast256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512,vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit aesdeclast form (8 x long long), gated on "avx512f,evex512,vaes".
+  def aesdeclast512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit form (16 x char) with a trailing _Constant char; gated on "gfni" alone.
+  def vgf2p8affineinvqb_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+}
+
+let Features = "avx,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit form (32 x char) with a trailing _Constant char; gated on "avx,gfni".
+  def vgf2p8affineinvqb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
+}
+
+let Features = "avx512f,evex512,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit form (64 x char) with a trailing _Constant char; gated on "avx512f,evex512,gfni".
+  def vgf2p8affineinvqb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
+}
+
+let Features = "gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit form (16 x char) with a trailing _Constant char; gated on "gfni" alone.
+  def vgf2p8affineqb_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+}
+
+let Features = "avx,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit form (32 x char) with a trailing _Constant char; gated on "avx,gfni".
+  def vgf2p8affineqb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
+}
+
+let Features = "avx512f,evex512,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit form (64 x char) with a trailing _Constant char; gated on "avx512f,evex512,gfni".
+  def vgf2p8affineqb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
+}
+
+let Features = "gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit form: two 16 x char vectors in, one out; gated on "gfni" alone.
+  def vgf2p8mulb_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
+}
+
+let Features = "avx,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit form: two 32 x char vectors in, one out; gated on "avx,gfni".
+  def vgf2p8mulb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+}
+
+let Features = "avx512f,evex512,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit form: two 64 x char vectors in, one out; gated on "avx512f,evex512,gfni".
+  def vgf2p8mulb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "pclmul", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit form (2 x long long) with a trailing _Constant char; gated on "pclmul".
+  def pclmulqdq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">;
+}
+
+let Features = "vpclmulqdq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit form (4 x long long) with a trailing _Constant char; gated on "vpclmulqdq" rather than "pclmul".
+  def pclmulqdq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant char)">;
+}
+
+let Features = "avx512f,evex512,vpclmulqdq", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit form (8 x long long) with a trailing _Constant char; gated on "avx512f,evex512,vpclmulqdq".
+  def pclmulqdq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant char)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // "avx" Const builtins; every entry has at least one 256-bit vector operand or result.
+  def vpermilvarpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>)">;
+  def vpermilvarps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">;
+  def blendpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def blendvpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
+  def blendvps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
+  def shufpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def shufps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def dpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
+  def cmppd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant char)">;
+  def cmpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
+  def vextractf128_pd256 : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int)">;
+  def vextractf128_ps256 : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int)">;
+  def vextractf128_si256 : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int)">;
+  def cvtpd2ps256 : X86Builtin<"_Vector<4, float>(_Vector<4, double>)">;
+  def cvtps2dq256 : X86Builtin<"_Vector<8, int>(_Vector<8, float>)">;
+  def cvttpd2dq256 : X86Builtin<"_Vector<4, int>(_Vector<4, double>)">;
+  def cvtpd2dq256 : X86Builtin<"_Vector<4, int>(_Vector<4, double>)">;
+  def cvttps2dq256 : X86Builtin<"_Vector<8, int>(_Vector<8, float>)">;
+  def vperm2f128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vperm2f128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vperm2f128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // "avx" Const builtins whose vector operands and results are all 128 bits wide.
+  def vpermilvarpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>)">; // 128-bit only, so declared here rather than under RequiredVectorWidth<256>
+  def vpermilvarps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>)">; // 128-bit only, so declared here rather than under RequiredVectorWidth<256>
+  def vpermilpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int)">;
+  def vpermilps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // "avx" Const builtins with 256-bit results (4 x double, 8 x float or 8 x int); the vinsertf128_* forms also take a 128-bit vector.
+  def vpermilpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
+  def vpermilps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
+  def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
+  def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
+  def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
+  def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
+  def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
+  def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
+  def rcpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
+  def roundpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
+  def roundps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // "avx" vtest* builtins on 128-bit vectors: two same-typed vectors in, int out.
+  def vtestzpd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>)">;
+  def vtestcpd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>)">;
+  def vtestnzcpd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>)">;
+  def vtestzps : X86Builtin<"int(_Vector<4, float>, _Vector<4, float>)">;
+  def vtestcps : X86Builtin<"int(_Vector<4, float>, _Vector<4, float>)">;
+  def vtestnzcps : X86Builtin<"int(_Vector<4, float>, _Vector<4, float>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // "avx" builtins on 256-bit vectors that return int: two-operand vtest*/ptest* forms and one-operand movmsk* forms.
+  def vtestzpd256 : X86Builtin<"int(_Vector<4, double>, _Vector<4, double>)">;
+  def vtestcpd256 : X86Builtin<"int(_Vector<4, double>, _Vector<4, double>)">;
+  def vtestnzcpd256 : X86Builtin<"int(_Vector<4, double>, _Vector<4, double>)">;
+  def vtestzps256 : X86Builtin<"int(_Vector<8, float>, _Vector<8, float>)">;
+  def vtestcps256 : X86Builtin<"int(_Vector<8, float>, _Vector<8, float>)">;
+  def vtestnzcps256 : X86Builtin<"int(_Vector<8, float>, _Vector<8, float>)">;
+  def ptestz256 : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">;
+  def ptestc256 : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">;
+  def ptestnzc256 : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">;
+  def movmskpd256 : X86Builtin<"int(_Vector<4, double>)">;
+  def movmskps256 : X86Builtin<"int(_Vector<8, float>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow] in { // "avx" builtins with signature void(); neither Const nor RequiredVectorWidth is set.
+  def vzeroall : X86Builtin<"void()">;
+  def vzeroupper : X86Builtin<"void()">;
+}
+
+let Features = "avx", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // Returns 32 x char from a char const * operand; declared without Const.
+  def lddqu256 : X86Builtin<"_Vector<32, char>(char const *)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 128-bit "avx" maskload*: (_Vector<N, T const *>, N-element integer vector) -> N x T; declared without Const.
+  def maskloadpd : X86Builtin<"_Vector<2, double>(_Vector<2, double const *>, _Vector<2, long long int>)">;
+  def maskloadps : X86Builtin<"_Vector<4, float>(_Vector<4, float const *>, _Vector<4, int>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 256-bit "avx" maskload*: (_Vector<N, T const *>, N-element integer vector) -> N x T; declared without Const.
+  def maskloadpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double const *>, _Vector<4, long long int>)">;
+  def maskloadps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float const *>, _Vector<8, int>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 128-bit "avx" maskstore*: void(_Vector<N, T *>, N-element integer vector, N x T); declared without Const.
+  def maskstorepd : X86Builtin<"void(_Vector<2, double *>, _Vector<2, long long int>, _Vector<2, double>)">;
+  def maskstoreps : X86Builtin<"void(_Vector<4, float *>, _Vector<4, int>, _Vector<4, float>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 256-bit "avx" maskstore*: void(_Vector<N, T *>, N-element integer vector, N x T); declared without Const.
+  def maskstorepd256 : X86Builtin<"void(_Vector<4, double *>, _Vector<4, long long int>, _Vector<4, double>)">;
+  def maskstoreps256 : X86Builtin<"void(_Vector<8, float *>, _Vector<8, int>, _Vector<8, float>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // Element access on 256-bit integer vectors: vec_ext_* returns the scalar element type, vec_set_* takes a scalar and returns the vector type; both take a _Constant int.
+  def vec_ext_v32qi : X86Builtin<"char(_Vector<32, char>, _Constant int)">;
+  def vec_ext_v16hi : X86Builtin<"short(_Vector<16, short>, _Constant int)">;
+  def vec_ext_v8si : X86Builtin<"int(_Vector<8, int>, _Constant int)">;
+  def vec_set_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, char, _Constant int)">;
+  def vec_set_v16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, short, _Constant int)">;
+  def vec_set_v8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int, _Constant int)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // "avx2" Const builtins; apart from pblendd128 every entry has a 256-bit vector operand or result.
+  def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
+  def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
+  def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+  def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
+  def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+  def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
+  def pavgb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+  def pavgw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def pblendvb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
+  def pblendw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Constant int)">;
+  def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+  def phaddsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+  def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">;
+  def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;
+  def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
+  def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
+  def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def pmulhuw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def pmulhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
+  def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
+  def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+  def pshufd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">;
+  def pshuflw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">;
+  def pshufhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">;
+  def psignb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+  def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def psignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+  def psllwi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
+  def psllw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
+  def pslldqi256_byteshift : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
+  def pslldi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
+  def pslld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
+  def psllqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">;
+  def psllq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
+  def psrawi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
+  def psraw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
+  def psradi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
+  def psrad256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
+  def psrldqi256_byteshift : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
+  def psrlwi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
+  def psrlw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
+  def psrldi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
+  def psrld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
+  def psrlqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">;
+  def psrlq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
+  def pblendd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">; // NOTE(review): every vector here is 4 x int (128-bit) yet the enclosing block sets RequiredVectorWidth<256> - confirm whether this belongs under RequiredVectorWidth<128>.
+  def pblendd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+  def permvarsi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+  def permdf256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
+  def permvarsf256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">;
+  def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+  def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
+  def extract128i256 : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int)">;
+  def insert128i256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 256-bit "avx2" integer maskload*: (_Vector<N, T const *>, N x T) -> N x T; declared without Const.
+  def maskloadd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int const *>, _Vector<8, int>)">;
+  def maskloadq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int const *>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 128-bit "avx2" integer maskload*: (_Vector<N, T const *>, N x T) -> N x T; declared without Const.
+  def maskloadd : X86Builtin<"_Vector<4, int>(_Vector<4, int const *>, _Vector<4, int>)">;
+  def maskloadq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int const *>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 256-bit "avx2" integer maskstore*: void(_Vector<N, T *>, N x T, N x T); declared without Const.
+  def maskstored256 : X86Builtin<"void(_Vector<8, int *>, _Vector<8, int>, _Vector<8, int>)">;
+  def maskstoreq256 : X86Builtin<"void(_Vector<4, long long int *>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 128-bit "avx2" integer maskstore*: void(_Vector<N, T *>, N x T, N x T); declared without Const.
+  def maskstored : X86Builtin<"void(_Vector<4, int *>, _Vector<4, int>, _Vector<4, int>)">;
+  def maskstoreq : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit form: two 8 x int vectors in, 8 x int out.
+  def psllv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit form: two 4 x int vectors in, 4 x int out.
+  def psllv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit form: two 4 x long long vectors in, 4 x long long out.
+  def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit form: two 2 x long long vectors in, 2 x long long out.
+  def psllv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit form: two 8 x int vectors in, 8 x int out.
+  def psrav8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit form: two 4 x int vectors in, 4 x int out.
+  def psrav4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit form: two 8 x int vectors in, 8 x int out.
+  def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit form: two 4 x int vectors in, 4 x int out.
+  def psrlv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit form: two 4 x long long vectors in, 4 x long long out.
+  def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit form: two 2 x long long vectors in, 2 x long long out.
+  def psrlv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 2 x double result; operands: 2 x double, double const *, 4 x int (presumably the indices), 2 x double, _Constant char. Not Const.
+  def gatherd_pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, double const *, _Vector<4, int>, _Vector<2, double>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 4 x double result; operands: 4 x double, double const *, 4 x int (presumably the indices), 4 x double, _Constant char. Not Const.
+  def gatherd_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, double const *, _Vector<4, int>, _Vector<4, double>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 2 x double result; operands: 2 x double, double const *, 2 x long long (presumably the indices), 2 x double, _Constant char. Not Const.
+  def gatherq_pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, double const *, _Vector<2, long long int>, _Vector<2, double>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 4 x double result; operands: 4 x double, double const *, 4 x long long (presumably the indices), 4 x double, _Constant char. Not Const.
+  def gatherq_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, double const *, _Vector<4, long long int>, _Vector<4, double>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 4 x float result; operands: 4 x float, float const *, 4 x int (presumably the indices), 4 x float, _Constant char. Not Const.
+  def gatherd_ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, float const *, _Vector<4, int>, _Vector<4, float>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 8 x float result; operands: 8 x float, float const *, 8 x int (presumably the indices), 8 x float, _Constant char. Not Const.
+  def gatherd_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, float const *, _Vector<8, int>, _Vector<8, float>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 4 x float result; operands: 4 x float, float const *, 2 x long long (presumably the indices), 4 x float, _Constant char. Not Const.
+  def gatherq_ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, float const *, _Vector<2, long long int>, _Vector<4, float>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // Result and float operands are only 4 x float; the one 256-bit operand is the 4 x long long vector. Not Const.
+  def gatherq_ps256 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, float const *, _Vector<4, long long int>, _Vector<4, float>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 2 x long long result; operands: 2 x long long, long long const *, 4 x int (presumably the indices), 2 x long long, _Constant char. Not Const.
+  def gatherd_q : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, long long int const *, _Vector<4, int>, _Vector<2, long long int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 4 x long long result; operands: 4 x long long, long long const *, 4 x int (presumably the indices), 4 x long long, _Constant char. Not Const.
+  def gatherd_q256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, long long int const *, _Vector<4, int>, _Vector<4, long long int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 2 x long long result; every vector operand is 2 x long long, plus a long long const * and a _Constant char. Not Const.
+  def gatherq_q : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, long long int const *, _Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 4 x long long result; every vector operand is 4 x long long, plus a long long const * and a _Constant char. Not Const.
+  def gatherq_q256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, long long int const *, _Vector<4, long long int>, _Vector<4, long long int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 4 x int result; every vector operand is 4 x int, plus an int const * and a _Constant char. Not Const.
+  def gatherd_d : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int const *, _Vector<4, int>, _Vector<4, int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 8 x int result; every vector operand is 8 x int, plus an int const * and a _Constant char. Not Const.
+  def gatherd_d256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int const *, _Vector<8, int>, _Vector<8, int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 4 x int result; operands: 4 x int, int const *, 2 x long long (presumably the indices), 4 x int, _Constant char. Not Const.
+  def gatherq_d : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int const *, _Vector<2, long long int>, _Vector<4, int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // Result and int operands are only 4 x int; the one 256-bit operand is the 4 x long long vector. Not Const.
+  def gatherq_d256 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int const *, _Vector<4, long long int>, _Vector<4, int>, _Constant char)">;
+}
+
+let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 4 x float plus a _Constant int in, 8 x short out.
+  def vcvtps2ph : X86Builtin<"_Vector<8, short>(_Vector<4, float>, _Constant int)">;
+}
+
+let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 8 x float plus a _Constant int in, 8 x short out; only the float operand is 256 bits wide.
+  def vcvtps2ph256 : X86Builtin<"_Vector<8, short>(_Vector<8, float>, _Constant int)">;
+}
+
+let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 8 x short in, 4 x float out.
+  def vcvtph2ps : X86Builtin<"_Vector<4, float>(_Vector<8, short>)">;
+}
+
+let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 8 x short in, 8 x float out; only the result is 256 bits wide.
+  def vcvtph2ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, short>)">;
+}
+
+let Features = "rdrnd", Attributes = [NoThrow] in { // "rdrnd" builtins: take a non-const pointer to unsigned short/int and return unsigned int; not Const.
+  def rdrand16_step : X86Builtin<"unsigned int(unsigned short *)">;
+  def rdrand32_step : X86Builtin<"unsigned int(unsigned int *)">;
+}
+
+let Features = "fxsr", Attributes = [NoThrow] in { // "fxsr" builtins, both void(void *); not Const.
+  def fxrstor : X86Builtin<"void(void *)">;
+  def fxsave : X86Builtin<"void(void *)">;
+}
+
+let Features = "xsave", Attributes = [NoThrow] in { // "xsave" builtins: xsave/xrstor take (void *, unsigned long long); xgetbv maps unsigned int -> unsigned long long.
+  def xsave : X86Builtin<"void(void *, unsigned long long int)">;
+  def xrstor : X86Builtin<"void(void *, unsigned long long int)">;
+  def xgetbv : X86Builtin<"unsigned long long int(unsigned int)">;
+}
+
+let Header = "immintrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in { // X86LibBuiltin (not X86Builtin) bound to header "immintrin.h" and ALL_MS_LANGUAGES; no Features string is set.
+  def _xgetbv : X86LibBuiltin<"uint64_t(unsigned int)">;
+}
+
+let Features = "xsave", Attributes = [NoThrow] in { // "xsave"-gated: void(unsigned int, unsigned long long).
+  def xsetbv : X86Builtin<"void(unsigned int, unsigned long long int)">;
+}
+
+let Header = "immintrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in { // X86LibBuiltin (not X86Builtin) bound to header "immintrin.h" and ALL_MS_LANGUAGES; no Features string is set.
+  def _xsetbv : X86LibBuiltin<"void(unsigned int, uint64_t)">;
+}
+
+let Features = "xsaveopt", Attributes = [NoThrow] in { // "xsaveopt"-gated: void(void *, unsigned long long).
+  def xsaveopt : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+let Features = "xsaves", Attributes = [NoThrow] in { // "xsaves"-gated: void(void *, unsigned long long).
+  def xrstors : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+let Features = "xsavec", Attributes = [NoThrow] in { // "xsavec"-gated: void(void *, unsigned long long).
+  def xsavec : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+let Features = "xsaves", Attributes = [NoThrow] in { // "xsaves"-gated: void(void *, unsigned long long).
+  def xsaves : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+let Features = "shstk", Attributes = [NoThrow] in { // "shstk" builtins with scalar/pointer signatures only; rdsspd is the only one that returns a value (unsigned int).
+  def incsspd : X86Builtin<"void(unsigned int)">;
+  def rdsspd : X86Builtin<"unsigned int(unsigned int)">;
+  def saveprevssp : X86Builtin<"void()">;
+  def rstorssp : X86Builtin<"void(void *)">;
+  def wrssd : X86Builtin<"void(unsigned int, void *)">;
+  def wrussd : X86Builtin<"void(unsigned int, void *)">;
+  def setssbsy : X86Builtin<"void()">;
+  def clrssbsy : X86Builtin<"void(void *)">;
+}
+
+let Features = "clflushopt", Attributes = [NoThrow] in { // "clflushopt"-gated: void(void const *).
+  def clflushopt : X86Builtin<"void(void const *)">;
+}
+
+let Features = "clwb", Attributes = [NoThrow] in { // "clwb"-gated: void(void const *).
+  def clwb : X86Builtin<"void(void const *)">;
+}
+
+let Attributes = [NoThrow] in { // No Features string is set here, so wbinvd is declared without a target-feature gate.
+  def wbinvd : X86Builtin<"void()">;
+}
+
+let Features = "wbnoinvd", Attributes = [NoThrow] in { // "wbnoinvd"-gated: void().
+  def wbnoinvd : X86Builtin<"void()">;
+}
+
+let Attributes = [NoThrow, Constexpr] in { // No Features string; Constexpr but not Const. The last operand is an unsigned int * (presumably where the result is stored - confirm).
+  def addcarryx_u32 : X86Builtin<"unsigned char(unsigned char, unsigned int, unsigned int, unsigned int *)">;
+  def subborrow_u32 : X86Builtin<"unsigned char(unsigned char, unsigned int, unsigned int, unsigned int *)">;
+}
+
+let Features = "rdseed", Attributes = [NoThrow] in { // "rdseed" builtins: take a non-const pointer to unsigned short/int and return unsigned int; not Const.
+  def rdseed16_step : X86Builtin<"unsigned int(unsigned short *)">;
+  def rdseed32_step : X86Builtin<"unsigned int(unsigned int *)">;
+}
+
+let Features = "lzcnt", Attributes = [NoThrow, Const, Constexpr] in { // "lzcnt" builtins: one operand, result of the same unsigned type; Const and Constexpr.
+  def lzcnt_u16 : X86Builtin<"unsigned short(unsigned short)">;
+  def lzcnt_u32 : X86Builtin<"unsigned int(unsigned int)">;
+}
+
+let Features = "bmi", Attributes = [NoThrow, Const, Constexpr] in { // "bmi"-gated: two unsigned int operands, unsigned int result; Const and Constexpr.
+  def bextr_u32 : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+}
+
+let Attributes = [NoThrow, Const, Constexpr] in { // No Features string is set, so the tzcnt_* builtins are declared without a target-feature gate.
+  def tzcnt_u16 : X86Builtin<"unsigned short(unsigned short)">;
+  def tzcnt_u32 : X86Builtin<"unsigned int(unsigned int)">;
+}
+
+let Features = "bmi2", Attributes = [NoThrow, Const, Constexpr] in { // "bmi2" builtins: two unsigned int operands, unsigned int result; Const and Constexpr.
+  def bzhi_si : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def pdep_si : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def pext_si : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+}
+
+let Features = "tbm", Attributes = [NoThrow, Const, Constexpr] in { // "tbm"-gated; the second operand is a _Constant unsigned int.
+  def bextri_u32 : X86Builtin<"unsigned int(unsigned int, _Constant unsigned int)">;
+}
+
+let Features = "lwp", Attributes = [NoThrow] in { // Gated on "lwp"; NoThrow only (no Const).
+  def llwpcb : X86Builtin<"void(void *)">; // Takes a void pointer, no result.
+  def slwpcb : X86Builtin<"void *()">; // No arguments, returns a void pointer.
+  def lwpins32 : X86Builtin<"unsigned char(unsigned int, unsigned int, _Constant unsigned int)">; // Last operand is _Constant; returns unsigned char.
+  def lwpval32 : X86Builtin<"void(unsigned int, unsigned int, _Constant unsigned int)">; // Same operands as lwpins32 but no result.
+}
+
+let Features = "sha", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // Gated on "sha"; every def operates on and returns _Vector<4, int>.
+  def sha1rnds4 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant char)">; // Only def in this block with a _Constant operand.
+  def sha1nexte : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def sha1msg1 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def sha1msg2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def sha256rnds2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; // Three vector operands.
+  def sha256msg1 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def sha256msg2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // Feature string lists "fma" and "fma4" separated by '|'; 128-bit, three same-typed vector operands.
+  def vfmaddps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
+  def vfmaddpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "fma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // "fma" only; same prototypes as the "fma4"-only vfmaddss/vfmaddsd defs but with a "3" suffix in the name.
+  def vfmaddss3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
+  def vfmaddsd3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // "fma4" only; prototypes match the "fma"-only vfmaddss3/vfmaddsd3 defs.
+  def vfmaddss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
+  def vfmaddsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // Same let header as the vfmaddps/vfmaddpd block; kept as a separate block in this file.
+  def vfmaddsubps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
+  def vfmaddsubpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit counterparts: 8 x float and 4 x double vectors, three operands each.
+  def vfmaddps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
+  def vfmaddpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
+  def vfmaddsubps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
+  def vfmaddsubpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit masked FMA family: three vectors, a mask (unsigned char for 8 lanes, unsigned short for 16), then a _Constant int.
+  def vfmaddpd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">; // trailing _Constant int is presumably a rounding-mode immediate; confirm
+  def vfmaddpd512_maskz : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddpd512_mask3 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmsubpd512_mask3 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddps512_maskz : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddps512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmsubps512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddsubpd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddsubpd512_maskz : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddsubpd512_mask3 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmsubaddpd512_mask3 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddsubps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddsubps512_maskz : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddsubps512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmsubaddps512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // Gated on "xop"; all defs in this block are 128-bit and Const.
+  def vpmacssww : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">; // vpmac*/vpmadc*: two source vectors plus a third vector whose type matches the result.
+  def vpmacsww : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
+  def vpmacsswd : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>, _Vector<4, int>)">;
+  def vpmacswd : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>, _Vector<4, int>)">;
+  def vpmacssdd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpmacsdd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpmacssdql : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>, _Vector<2, long long int>)">;
+  def vpmacsdql : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>, _Vector<2, long long int>)">;
+  def vpmacssdqh : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>, _Vector<2, long long int>)">;
+  def vpmacsdqh : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>, _Vector<2, long long int>)">;
+  def vpmadcsswd : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>, _Vector<4, int>)">;
+  def vpmadcswd : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>, _Vector<4, int>)">;
+  def vphaddbw : X86Builtin<"_Vector<8, short>(_Vector<16, char>)">; // vphadd*/vphsub*: one source vector; result has fewer, wider elements.
+  def vphaddbd : X86Builtin<"_Vector<4, int>(_Vector<16, char>)">;
+  def vphaddbq : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>)">;
+  def vphaddwd : X86Builtin<"_Vector<4, int>(_Vector<8, short>)">;
+  def vphaddwq : X86Builtin<"_Vector<2, long long int>(_Vector<8, short>)">;
+  def vphadddq : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>)">;
+  def vphaddubw : X86Builtin<"_Vector<8, short>(_Vector<16, char>)">; // The "u" variants use the same prototypes as the non-"u" variants above.
+  def vphaddubd : X86Builtin<"_Vector<4, int>(_Vector<16, char>)">;
+  def vphaddubq : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>)">;
+  def vphadduwd : X86Builtin<"_Vector<4, int>(_Vector<8, short>)">;
+  def vphadduwq : X86Builtin<"_Vector<2, long long int>(_Vector<8, short>)">;
+  def vphaddudq : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>)">;
+  def vphsubbw : X86Builtin<"_Vector<8, short>(_Vector<16, char>)">;
+  def vphsubwd : X86Builtin<"_Vector<4, int>(_Vector<8, short>)">;
+  def vphsubdq : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>)">;
+  def vpperm : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
+  def vprotb : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; // vprot{b,w,d,q}: second operand is a vector of the same type.
+  def vprotw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def vprotd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def vprotq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def vprotbi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant char)">; // vprot*i: second operand is a _Constant char instead of a vector.
+  def vprotwi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant char)">;
+  def vprotdi : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant char)">;
+  def vprotqi : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant char)">;
+  def vpshlb : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; // vpshl*/vpsha*: two same-typed vectors in, same type out.
+  def vpshlw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def vpshld : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def vpshlq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def vpshab : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
+  def vpshaw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def vpshad : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def vpshaq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def vpcomub : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">; // vpcom*: two vectors plus a _Constant char; result is a vector of the operand type.
+  def vpcomuw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant char)">;
+  def vpcomud : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant char)">;
+  def vpcomuq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">;
+  def vpcomb : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def vpcomw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant char)">;
+  def vpcomd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant char)">;
+  def vpcomq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">;
+  def vpermil2pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, long long int>, _Constant char)">; // Two data vectors, one integer vector, and a _Constant char.
+}
+
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit form of vpermil2pd; the integer operand has 4 x long long int.
+  def vpermil2pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, long long int>, _Constant char)">;
+}
+
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit float form; the integer operand is _Vector<4, int>.
+  def vpermil2ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, int>, _Constant char)">;
+}
+
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit float form; the integer operand is _Vector<8, int>.
+  def vpermil2ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, int>, _Constant char)">;
+}
+
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit vfrcz* defs: one vector in, same vector type out.
+  def vfrczss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def vfrczsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
+  def vfrczps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def vfrczpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
+}
+
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit vfrcz* defs; only the ps/pd forms appear at this width.
+  def vfrczps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
+  def vfrczpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
+}
+
+let Features = "rtm", Attributes = [NoThrow] in { // Gated on "rtm"; NoThrow only (no Const).
+  def xbegin : X86Builtin<"int()">; // No arguments, returns int.
+  def xend : X86Builtin<"void()">;
+  def xabort : X86Builtin<"void(_Constant char)">; // Single operand must be a _Constant char.
+  def xtest : X86Builtin<"int()">;
+}
+
+let Features = "rdpid", Attributes = [NoThrow] in { // Gated on "rdpid".
+  def rdpid : X86Builtin<"unsigned int()">; // No arguments, returns unsigned int.
+}
+
+let Features = "rdpru", Attributes = [NoThrow], EnableOpenCLLong = 0 in { // Also overrides EnableOpenCLLong to 0; that field is defined outside this chunk -- presumably controls how "long long" is interpreted, confirm.
+  def rdpru : X86Builtin<"unsigned long long int(int)">; // One int in, unsigned long long int out.
+}
+
+let Features = "pku", Attributes = [NoThrow] in { // Gated on "pku"; a read/write pair over unsigned int.
+  def rdpkru : X86Builtin<"unsigned int()">;
+  def wrpkru : X86Builtin<"void(unsigned int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit sqrt: one vector plus a _Constant int; no mask operand.
+  def sqrtpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int)">;
+  def sqrtps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Constant int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // "avx512f" without "evex512"; 128-bit defs with three vectors and an unsigned char mask.
+  def rsqrt14sd_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char)">;
+  def rsqrt14ss_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit forms: two vectors plus a mask sized to the lane count (char for 8, short for 16).
+  def rsqrt14pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, unsigned char)">;
+  def rsqrt14ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, unsigned short)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // Same prototypes as the rsqrt14sd/ss_mask defs: three vectors and an unsigned char mask.
+  def rcp14sd_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char)">;
+  def rcp14ss_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // Mixed 512-bit group: rcp14, truncating conversions (cvtt*), and the float compare mask.
+  def rcp14pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, unsigned char)">;
+  def rcp14ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, unsigned short)">;
+  def cvttps2dq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">; // Source vector, integer vector of the result type, mask, _Constant int.
+  def cvttps2udq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+  def cvttpd2dq512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">; // 8 doubles produce an 8 x int result.
+  def cvttpd2udq512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def cmpps512_mask : X86Builtin<"unsigned short(_Vector<16, float>, _Vector<16, float>, _Constant int, unsigned short, _Constant int)">; // Returns a 16-bit mask; carries two _Constant int operands around the mask argument.
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // The avx512vl compare masks take one _Constant int, not the two used by the 512-bit forms.
+  def cmpps256_mask : X86Builtin<"unsigned char(_Vector<8, float>, _Vector<8, float>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit float compare; mask type stays unsigned char.
+  def cmpps128_mask : X86Builtin<"unsigned char(_Vector<4, float>, _Vector<4, float>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit double compare; has the extra trailing _Constant int.
+  def cmppd512_mask : X86Builtin<"unsigned char(_Vector<8, double>, _Vector<8, double>, _Constant int, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit double compare.
+  def cmppd256_mask : X86Builtin<"unsigned char(_Vector<4, double>, _Vector<4, double>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit double compare.
+  def cmppd128_mask : X86Builtin<"unsigned char(_Vector<2, double>, _Vector<2, double>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit Const group: round-scale, conversions, min/max, and widening multiplies.
+  def rndscaleps_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Constant int, _Vector<16, float>, unsigned short, _Constant int)">; // Note the _Constant int sits second, before the second vector and the mask.
+  def rndscalepd_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int, _Vector<8, double>, unsigned char, _Constant int)">;
+  def cvtps2dq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+  def cvtpd2dq512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def cvtps2udq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+  def cvtpd2udq512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def minps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">; // min/max: two vectors plus a _Constant int, no mask.
+  def minpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def maxps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def maxpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def cvtdq2ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, int>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def cvtudq2ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, int>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def cvtpd2ps512_mask : X86Builtin<"_Vector<8, float>(_Vector<8, double>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vcvtps2ph512_mask : X86Builtin<"_Vector<16, short>(_Vector<16, float>, _Constant int, _Vector<16, short>, unsigned short)">; // Half-precision values are carried as _Vector<16, short>; the _Constant int is second here.
+  def vcvtph2ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, short>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def pmuldq512 : X86Builtin<"_Vector<8, long long int>(_Vector<16, int>, _Vector<16, int>)">; // 16 x int operands produce an 8 x long long int result.
+  def pmuludq512 : X86Builtin<"_Vector<8, long long int>(_Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { // Masked 512-bit loads/stores; Const is omitted from the attribute list here, unlike the surrounding blocks.
+  def loaddqusi512_mask : X86Builtin<"_Vector<16, int>(int const *, _Vector<16, int>, unsigned short)">; // Loads: const pointer, vector of the result type, mask.
+  def loaddqudi512_mask : X86Builtin<"_Vector<8, long long int>(long long int const *, _Vector<8, long long int>, unsigned char)">;
+  def loadups512_mask : X86Builtin<"_Vector<16, float>(float const *, _Vector<16, float>, unsigned short)">;
+  def loadaps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float const *>, _Vector<16, float>, unsigned short)">; // NOTE(review): "const *" is written inside _Vector<...>; confirm the prototype parser reads this as a pointer to a const vector, not a vector of pointers.
+  def loadupd512_mask : X86Builtin<"_Vector<8, double>(double const *, _Vector<8, double>, unsigned char)">;
+  def loadapd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double const *>, _Vector<8, double>, unsigned char)">; // NOTE(review): same in-vector pointer spelling as loadaps512_mask.
+  def storedqudi512_mask : X86Builtin<"void(long long int *, _Vector<8, long long int>, unsigned char)">; // Stores: destination pointer, value vector, mask; no result.
+  def storedqusi512_mask : X86Builtin<"void(int *, _Vector<16, int>, unsigned short)">;
+  def storeupd512_mask : X86Builtin<"void(double *, _Vector<8, double>, unsigned char)">;
+  def storeapd512_mask : X86Builtin<"void(_Vector<8, double *>, _Vector<8, double>, unsigned char)">; // NOTE(review): pointer spelled inside _Vector<...>; confirm intended meaning.
+  def storeups512_mask : X86Builtin<"void(float *, _Vector<16, float>, unsigned short)">;
+  def storeaps512_mask : X86Builtin<"void(_Vector<16, float *>, _Vector<16, float>, unsigned short)">; // NOTE(review): pointer spelled inside _Vector<...>; confirm intended meaning.
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit align: two same-typed vectors plus a _Constant int.
+  def alignq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
+  def alignd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // avx512vl align forms follow, alternating 128/256-bit blocks; each takes two vectors plus a _Constant int.
+  def alignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit, int elements.
+  def alignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit, long long int elements.
+  def alignq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit, long long int elements.
+  def alignq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // Extracts: 512-bit source, _Constant int, a vector of the narrower result type, and an unsigned char mask.
+  def extractf64x4_mask : X86Builtin<"_Vector<4, double>(_Vector<8, double>, _Constant int, _Vector<4, double>, unsigned char)">;
+  def extractf32x4_mask : X86Builtin<"_Vector<4, float>(_Vector<16, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // Feature string mixes ',' and '|': presumably (avx512vl AND avx512vnni) OR avxvnni -- confirm precedence in the feature-string parser.
+  def vpdpbusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; // All vpdp* defs below take three int vectors of the result type.
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit vpdpbusd.
+  def vpdpbusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vnni,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit form uses "avx512vnni,evex512" with no avxvnni alternative.
+  def vpdpbusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit vpdpbusds.
+  def vpdpbusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit vpdpbusds.
+  def vpdpbusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vnni,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit vpdpbusds.
+  def vpdpbusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit vpdpwssd.
+  def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit vpdpwssd.
+  def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vnni,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit vpdpwssd.
+  def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit vpdpwssds.
+  def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit vpdpwssds.
+  def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vnni,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit vpdpwssds.
+  def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // Every block in this run lists "avxvnniint8" and "avx10.2-256" separated by '|'; only 128- and 256-bit forms appear here.
+  def vpdpbssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; // All defs in this run take three int vectors of the result type.
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit vpdpbssd.
+  def vpdpbssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit vpdpbssds.
+  def vpdpbssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit vpdpbssds.
+  def vpdpbssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit vpdpbsud.
+  def vpdpbsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit vpdpbsud.
+  def vpdpbsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit vpdpbsuds.
+  def vpdpbsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit vpdpbsuds.
+  def vpdpbsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit vpdpbuud.
+  def vpdpbuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit vpdpbuud.
+  def vpdpbuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit vpdpbuuds.
+  def vpdpbuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit vpdpbuuds.
+  def vpdpbuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "movrs", Attributes = [NoThrow, Const] in { // NOTE(review): marked Const although the only def returns void -- confirm this attribute is intended.
+  def prefetchrs : X86Builtin<"void(void const *)">; // One const void pointer in, no result.
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // gather3* run (avx512vl, no Const): each def takes a vector of the result type, a const void pointer, an index vector, an unsigned char mask, and a _Constant int.
+  def gather3div2df : X86Builtin<"_Vector<2, double>(_Vector<2, double>, void const *, _Vector<2, long long int>, unsigned char, _Constant int)">; // "div" defs use long long int index vectors.
+  def gather3div2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, void const *, _Vector<2, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 256-bit, 4 x 64-bit elements with 4 x long long int indices.
+  def gather3div4df : X86Builtin<"_Vector<4, double>(_Vector<4, double>, void const *, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def gather3div4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, void const *, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 4 x 32-bit result type paired with only 2 x long long int indices.
+  def gather3div4sf : X86Builtin<"_Vector<4, float>(_Vector<4, float>, void const *, _Vector<2, long long int>, unsigned char, _Constant int)">;
+  def gather3div4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, void const *, _Vector<2, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // Names say "8" but the data vectors are 4 x 32-bit; the index vector (4 x long long int) is the 256-bit operand.
+  def gather3div8sf : X86Builtin<"_Vector<4, float>(_Vector<4, float>, void const *, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def gather3div8si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, void const *, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // "siv" defs use int index vectors; here 2 x 64-bit data with a 4 x int index vector.
+  def gather3siv2df : X86Builtin<"_Vector<2, double>(_Vector<2, double>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+  def gather3siv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 4 x 64-bit data with a 4 x int index vector.
+  def gather3siv4df : X86Builtin<"_Vector<4, double>(_Vector<4, double>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+  def gather3siv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 4 x 32-bit data with a 4 x int index vector.
+  def gather3siv4sf : X86Builtin<"_Vector<4, float>(_Vector<4, float>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+  def gather3siv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 8 x 32-bit data with an 8 x int index vector.
+  def gather3siv8sf : X86Builtin<"_Vector<8, float>(_Vector<8, float>, void const *, _Vector<8, int>, unsigned char, _Constant int)">;
+  def gather3siv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, void const *, _Vector<8, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { // 512-bit gathers and scatters; no Const attribute in this block.
+  def gathersiv8df : X86Builtin<"_Vector<8, double>(_Vector<8, double>, void const *, _Vector<8, int>, unsigned char, _Constant int)">; // Gathers: vector of the result type, const base pointer, index vector, mask, _Constant int.
+  def gathersiv16sf : X86Builtin<"_Vector<16, float>(_Vector<16, float>, void const *, _Vector<16, int>, unsigned short, _Constant int)">;
+  def gatherdiv8df : X86Builtin<"_Vector<8, double>(_Vector<8, double>, void const *, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def gatherdiv16sf : X86Builtin<"_Vector<8, float>(_Vector<8, float>, void const *, _Vector<8, long long int>, unsigned char, _Constant int)">; // Name says "16" but the data vector is 8 x float, matching the 8 x long long int index vector.
+  def gathersiv8di : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, void const *, _Vector<8, int>, unsigned char, _Constant int)">;
+  def gathersiv16si : X86Builtin<"_Vector<16, int>(_Vector<16, int>, void const *, _Vector<16, int>, unsigned short, _Constant int)">;
+  def gatherdiv8di : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, void const *, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def gatherdiv16si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, void const *, _Vector<8, long long int>, unsigned char, _Constant int)">; // Same naming pattern: "16" in the name, 8 x int data.
+  def scattersiv8df : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, double>, _Constant int)">; // Scatters: non-const base pointer, mask, index vector, value vector, _Constant int; no result.
+  def scattersiv16sf : X86Builtin<"void(void *, unsigned short, _Vector<16, int>, _Vector<16, float>, _Constant int)">;
+  def scatterdiv8df : X86Builtin<"void(void *, unsigned char, _Vector<8, long long int>, _Vector<8, double>, _Constant int)">;
+  def scatterdiv16sf : X86Builtin<"void(void *, unsigned char, _Vector<8, long long int>, _Vector<8, float>, _Constant int)">;
+  def scattersiv8di : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, long long int>, _Constant int)">;
+  def scattersiv16si : X86Builtin<"void(void *, unsigned short, _Vector<16, int>, _Vector<16, int>, _Constant int)">;
+  def scatterdiv8di : X86Builtin<"void(void *, unsigned char, _Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
+  def scatterdiv16si : X86Builtin<"void(void *, unsigned char, _Vector<8, long long int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in { // knot* defs are split by feature: the 8-bit form is under "avx512dq".
+  def knotqi : X86Builtin<"unsigned char(unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in { // 16-bit form under "avx512f".
+  def knothi : X86Builtin<"unsigned short(unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in { // 32- and 64-bit forms under "avx512bw".
+  def knotsi : X86Builtin<"unsigned int(unsigned int)">;
+  def knotdi : X86Builtin<"unsigned long long int(unsigned long long int)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // Integer compare masks: two vectors, a _Constant int, and a mask; the mask/result type has one bit per lane (16 lanes -> unsigned short here).
+  def cmpb128_mask : X86Builtin<"unsigned short(_Vector<16, char>, _Vector<16, char>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // d/q element forms need only "avx512vl"; b/w forms also list "avx512bw".
+  def cmpd128_mask : X86Builtin<"unsigned char(_Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">;
+  def cmpq128_mask : X86Builtin<"unsigned char(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 8 x short lanes -> unsigned char mask.
+  def cmpw128_mask : X86Builtin<"unsigned char(_Vector<8, short>, _Vector<8, short>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 32 x char lanes -> unsigned int mask.
+  def cmpb256_mask : X86Builtin<"unsigned int(_Vector<32, char>, _Vector<32, char>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 256-bit d/q forms; unsigned char mask.
+  def cmpd256_mask : X86Builtin<"unsigned char(_Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">;
+  def cmpq256_mask : X86Builtin<"unsigned char(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 16 x short lanes -> unsigned short mask.
+  def cmpw256_mask : X86Builtin<"unsigned short(_Vector<16, short>, _Vector<16, short>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 64 x char lanes -> unsigned long long int mask.
+  def cmpb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, _Constant int, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit d/q forms under "avx512f,evex512".
+  def cmpd512_mask : X86Builtin<"unsigned short(_Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">;
+  def cmpq512_mask : X86Builtin<"unsigned char(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def ucmpb128_mask : X86Builtin<"unsigned short(_Vector<16, char>, _Vector<16, char>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def ucmpd128_mask : X86Builtin<"unsigned char(_Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">;
+  def ucmpq128_mask : X86Builtin<"unsigned char(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def ucmpw128_mask : X86Builtin<"unsigned char(_Vector<8, short>, _Vector<8, short>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def ucmpb256_mask : X86Builtin<"unsigned int(_Vector<32, char>, _Vector<32, char>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def ucmpd256_mask : X86Builtin<"unsigned char(_Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">;
+  def ucmpq256_mask : X86Builtin<"unsigned char(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def ucmpw256_mask : X86Builtin<"unsigned short(_Vector<16, short>, _Vector<16, short>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def ucmpb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, _Constant int, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def ucmpd512_mask : X86Builtin<"unsigned short(_Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">;
+  def ucmpq512_mask : X86Builtin<"unsigned char(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
+  def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+  def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
+  def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+  def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
+  def pavgb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+  def pavgw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+  def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpconflictdi_128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpconflictdi_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpconflictsi_128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpconflictsi_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>)">;
+}
+
+let Features = "avx512cd,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpconflictdi_512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>)">;
+  def vpconflictsi_512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>)">;
+  def vplzcntd_512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>)">;
+  def vplzcntq_512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>)">;
+}
+
+let Features = "avx512vl,avx512bitalg", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshufbitqmb128_mask : X86Builtin<"unsigned short(_Vector<16, char>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512bitalg", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshufbitqmb256_mask : X86Builtin<"unsigned int(_Vector<32, char>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512bitalg,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshufbitqmb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmulhrsw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+  def pmulhuw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+  def pmulhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def addpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def addps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def divpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def divps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def mulpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def mulps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def subpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def subps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmaddubsw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>)">;
+  def pmaddwd512 : X86Builtin<"_Vector<16, int>(_Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def addss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def divss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def mulss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def subss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def maxss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def minss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def addsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def divsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def mulsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def subsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def maxsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def minsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compressdf128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compressdf256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compressdi128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compressdi256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compresshi128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compresshi256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compressqi128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compressqi256_mask : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compresssf128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compresssf256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compresssi128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compresssi256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstoredf128_mask : X86Builtin<"void(_Vector<2, double *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstoredf256_mask : X86Builtin<"void(_Vector<4, double *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstoredi128_mask : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstoredi256_mask : X86Builtin<"void(_Vector<4, long long int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstorehi128_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstorehi256_mask : X86Builtin<"void(_Vector<16, short *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstoreqi128_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstoreqi256_mask : X86Builtin<"void(_Vector<32, char *>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstoresf128_mask : X86Builtin<"void(_Vector<4, float *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstoresf256_mask : X86Builtin<"void(_Vector<8, float *>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstoresi128_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstoresi256_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtpd2dq128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+  def cvtpd2ps_mask : X86Builtin<"_Vector<4, float>(_Vector<2, double>, _Vector<4, float>, unsigned char)">;
+  def cvtpd2udq128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtpd2udq256_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtps2udq128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, float>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtps2udq256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvttpd2dq128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+  def cvttpd2udq128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvttpd2udq256_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvttps2udq128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, float>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvttps2udq256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expanddf128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expanddf256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expanddi128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expanddi256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expandhi128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expandhi256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expandqi128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expandqi256_mask : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloaddf128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double const *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloaddf256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double const *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloaddi128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, long long int const *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloaddi256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int const *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloadhi128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, short const *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloadhi256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, short const *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloadqi128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char const *>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloadqi256_mask : X86Builtin<"_Vector<32, char>(_Vector<32, char const *>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloadsf128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float const *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloadsf256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float const *>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloadsi128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int const *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloadsi256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int const *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expandsf128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expandsf256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expandsi128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expandsi256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getexppd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getexppd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getexpps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getexpps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rndscalepd_128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rndscalepd_256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rndscaleps_128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rndscaleps_256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def scalefpd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def scalefpd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def scalefps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def scalefps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def scatterdiv2df : X86Builtin<"void(void *, unsigned char, _Vector<2, long long int>, _Vector<2, double>, _Constant int)">;
+  def scatterdiv2di : X86Builtin<"void(void *, unsigned char, _Vector<2, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def scatterdiv4df : X86Builtin<"void(void *, unsigned char, _Vector<4, long long int>, _Vector<4, double>, _Constant int)">;
+  def scatterdiv4di : X86Builtin<"void(void *, unsigned char, _Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def scatterdiv4sf : X86Builtin<"void(void *, unsigned char, _Vector<2, long long int>, _Vector<4, float>, _Constant int)">;
+  def scatterdiv4si : X86Builtin<"void(void *, unsigned char, _Vector<2, long long int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def scatterdiv8sf : X86Builtin<"void(void *, unsigned char, _Vector<4, long long int>, _Vector<4, float>, _Constant int)">;
+  def scatterdiv8si : X86Builtin<"void(void *, unsigned char, _Vector<4, long long int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def scattersiv2df : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<2, double>, _Constant int)">;
+  def scattersiv2di : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def scattersiv4df : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<4, double>, _Constant int)">;
+  def scattersiv4di : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def scattersiv4sf : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<4, float>, _Constant int)">;
+  def scattersiv4si : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def scattersiv8sf : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, float>, _Constant int)">;
+  def scattersiv8si : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermi2vard128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermi2varq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2varq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2varq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermi2varqi128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2varqi256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
+}
+
+let Features = "avx512vbmi,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2varqi512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermi2varhi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2varhi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2varhi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 4 x int: two vector operands plus a compile-time-constant int
+  def vpshldd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 8 x int form of the same signature
+  def vpshldd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 16 x int form; evex512 replaces avx512vl in the feature list
+  def vpshldd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 2 x long long: two vector operands plus a compile-time-constant int
+  def vpshldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4 x long long form of the same signature
+  def vpshldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 8 x long long form; evex512 replaces avx512vl in the feature list
+  def vpshldq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 8 x short: two vector operands plus a compile-time-constant int
+  def vpshldw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 16 x short form of the same signature
+  def vpshldw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 32 x short form; evex512 replaces avx512vl in the feature list
+  def vpshldw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 4 x int: third operand is a vector here, not a _Constant int
+  def vpshldvd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 8 x int form of the same signature
+  def vpshldvd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 16 x int form; evex512 replaces avx512vl in the feature list
+  def vpshldvd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 2 x long long: three same-typed vector operands, same-typed result
+  def vpshldvq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4 x long long form of the same signature
+  def vpshldvq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 8 x long long form; evex512 replaces avx512vl in the feature list
+  def vpshldvq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 8 x short: three same-typed vector operands, same-typed result
+  def vpshldvw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 16 x short form of the same signature
+  def vpshldvw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 32 x short form; evex512 replaces avx512vl in the feature list
+  def vpshldvw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 4 x int: same prototype as vpshldvd128, different builtin name
+  def vpshrdvd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 8 x int form of the same signature
+  def vpshrdvd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 16 x int form; evex512 replaces avx512vl in the feature list
+  def vpshrdvd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 2 x long long: three same-typed vector operands, same-typed result
+  def vpshrdvq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4 x long long form of the same signature
+  def vpshrdvq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 8 x long long form; evex512 replaces avx512vl in the feature list
+  def vpshrdvq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 8 x short: three same-typed vector operands, same-typed result
+  def vpshrdvw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 16 x short form of the same signature
+  def vpshrdvw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 32 x short form; evex512 replaces avx512vl in the feature list
+  def vpshrdvw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 4 x int: same prototype as vpshldd128 (two vectors + _Constant int)
+  def vpshrdd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 8 x int form of the same signature
+  def vpshrdd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 16 x int form; evex512 replaces avx512vl in the feature list
+  def vpshrdd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 2 x long long: two vector operands plus a compile-time-constant int
+  def vpshrdq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4 x long long form of the same signature
+  def vpshrdq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 8 x long long form; evex512 replaces avx512vl in the feature list
+  def vpshrdq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 8 x short: two vector operands plus a compile-time-constant int
+  def vpshrdw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 16 x short form of the same signature
+  def vpshrdw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 32 x short form; evex512 replaces avx512vl in the feature list
+  def vpshrdw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Constant int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 32 x short in, 32 x char out; 2nd operand has the result type, 3rd is unsigned int (presumably pass-through + per-lane mask -- confirm)
+  def pmovswb512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, short>, _Vector<32, char>, unsigned int)">;
+  def pmovuswb512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, short>, _Vector<32, char>, unsigned int)">;
+  def pmovwb512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, short>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 2 x double -> 2 x long long; 2nd operand has the result type, 3rd is unsigned char
+  def cvtpd2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4-lane form of cvtpd2qq128_mask
+  def cvtpd2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // prototype identical to cvtpd2qq128_mask; the result is still spelled long long int
+  def cvtpd2uqq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4-lane form of cvtpd2uqq128_mask
+  def cvtpd2uqq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // source is 4 x float but the result has only 2 lanes of long long
+  def cvtps2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4 x float -> 4 x long long; the source vector is the narrower one
+  def cvtps2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // prototype identical to cvtps2qq128_mask
+  def cvtps2uqq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // prototype identical to cvtps2qq256_mask
+  def cvtps2uqq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // two unrelated 128-bit conversions share this let: 2 x long long -> 4 x float, and 2 x double -> 2 x long long
+  def cvtqq2ps128_mask : X86Builtin<"_Vector<4, float>(_Vector<2, long long int>, _Vector<4, float>, unsigned char)">;
+  def cvttpd2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4-lane form of cvttpd2qq128_mask
+  def cvttpd2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // prototype identical to cvttpd2qq128_mask
+  def cvttpd2uqq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // prototype identical to cvttpd2qq256_mask
+  def cvttpd2uqq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // source is 4 x float but the result has only 2 lanes of long long
+  def cvttps2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4 x float -> 4 x long long
+  def cvttps2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // prototype identical to cvttps2qq128_mask
+  def cvttps2uqq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // prototype identical to cvttps2qq256_mask
+  def cvttps2uqq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // two unrelated 128-bit builtins share this let: a long long -> float conversion and a two-source double op with a _Constant int
+  def cvtuqq2ps128_mask : X86Builtin<"_Vector<4, float>(_Vector<2, long long int>, _Vector<4, float>, unsigned char)">;
+  def rangepd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4 x double form: (vec, vec, _Constant int, vec, unsigned char)
+  def rangepd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 4 x float form of the same operand layout
+  def rangeps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 8 x float form of the same operand layout
+  def rangeps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // only avx512dq required (no avx512vl); operand order differs: three vectors, unsigned char, then two _Constant ints
+  def rangesd128_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int, _Constant int)">;
+  def rangess128_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 2 x double: (vec, _Constant int, vec, unsigned char)
+  def reducepd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4 x double form of the same operand layout
+  def reducepd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 4 x float form of the same operand layout
+  def reduceps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 8 x float form of the same operand layout
+  def reduceps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // only avx512dq required (no avx512vl); three vectors, unsigned char, then two _Constant ints
+  def reducesd_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int, _Constant int)">;
+  def reducess_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 8 x short in, but the result (and 2nd operand) is a full 16 x char vector
+  def pmovswb128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 16 x short in, 16 x char out; 3rd operand widens to unsigned short
+  def pmovswb256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, short>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // prototype identical to pmovswb128_mask
+  def pmovuswb128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // prototype identical to pmovswb256_mask
+  def pmovuswb256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, short>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // prototype identical to pmovswb128_mask; no pmovwb256_mask def appears in this chunk
+  def pmovwb128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512dq,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit forms: every prototype here ends in an extra trailing _Constant int that the 128/256-bit defs in this chunk lack
+  def cvtpd2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvtpd2uqq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvtps2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvtps2uqq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvtqq2pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, long long int>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def cvtqq2ps512_mask : X86Builtin<"_Vector<8, float>(_Vector<8, long long int>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def cvttpd2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvttpd2uqq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvttps2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvttps2uqq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvtuqq2pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, long long int>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def cvtuqq2ps512_mask : X86Builtin<"_Vector<8, float>(_Vector<8, long long int>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def rangepd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int, _Vector<8, double>, unsigned char, _Constant int)">;
+  def rangeps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int, _Vector<16, float>, unsigned short, _Constant int)">;
+  def reducepd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int, _Vector<8, double>, unsigned char, _Constant int)">;
+  def reduceps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Constant int, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // one vector operand plus a compile-time-constant int; result has the operand's type
+  def prold512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int)">;
+  def prolq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 4 x int form; only avx512vl is listed
+  def prold128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 8 x int form
+  def prold256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 2 x long long form
+  def prolq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4 x long long form
+  def prolq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // mixes two shapes: prolv* take (vec, vec), pror* take (vec, _Constant int)
+  def prolvd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def prolvq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+  def prord512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int)">;
+  def prorq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 4 x int: two same-typed vector operands
+  def prolvd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 8 x int form
+  def prolvd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 2 x long long form
+  def prolvq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4 x long long form
+  def prolvq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 4 x int: one vector operand plus a compile-time-constant int
+  def prord128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 8 x int form
+  def prord256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 2 x long long form
+  def prorq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4 x long long form
+  def prorq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // two same-typed vector operands, same-typed result
+  def prorvd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def prorvq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 4 x int form
+  def prorvd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 8 x int form
+  def prorvd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 2 x long long form
+  def prorvq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4 x long long form
+  def prorvq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 32 x short results; second operand varies: _Constant int, 32 x short, 8 x short (psllw512), or plain int (psllwi512)
+  def pshufhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">;
+  def pshuflw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">;
+  def psllv32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+  def psllw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">;
+  def psllwi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 16 x short: two same-typed vector operands
+  def psllv16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 8 x short form
+  def psllv8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // second operand is a plain int, not _Constant int
+  def pslldi512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, int)">;
+  def psllqi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 32 x short: two same-typed vector operands
+  def psrlv32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 16 x short form
+  def psrlv16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 8 x short form
+  def psrlv8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // same prototypes as pslldi512/psllqi512 (vector, plain int)
+  def psrldi512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, int)">;
+  def psrlqi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 32 x short: two same-typed vector operands
+  def psrav32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 16 x short form
+  def psrav16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 8 x short form
+  def psrav8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 2 x long long form; only avx512vl is listed (no avx512bw)
+  def psravq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4 x long long form
+  def psravq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // *w512 take an 8 x short second operand, *wi512 a plain int; the *_byteshift defs use 8 x long long with a _Constant int
+  def psraw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">;
+  def psrawi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, int)">;
+  def psrlw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">;
+  def psrlwi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, int)">;
+  def pslldqi512_byteshift : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
+  def psrldqi512_byteshift : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // no Const attribute on the load/store defs. NOTE(review): `const *` sits inside `_Vector<...>`; presumably parsed as pointer-to-const-vector, not a vector of pointers -- confirm against the builtin prototype parser
+  def movdqa32load128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int const *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 8 x int load form
+  def movdqa32load256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int const *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { // 512-bit loads and stores together; stores return void and take a non-const pointer operand
+  def movdqa32load512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int const *>, _Vector<16, int>, unsigned short)">;
+  def movdqa32store512_mask : X86Builtin<"void(_Vector<16, int *>, _Vector<16, int>, unsigned short)">;
+  def movdqa64load512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int const *>, _Vector<8, long long int>, unsigned char)">;
+  def movdqa64store512_mask : X86Builtin<"void(_Vector<8, long long int *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 4 x int store: void(pointer, vector, unsigned char)
+  def movdqa32store128_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 8 x int store form
+  def movdqa32store256_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 2 x long long load form
+  def movdqa64load128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int const *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 4 x long long load form
+  def movdqa64load256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int const *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // 2 x long long store form
+  def movdqa64store128_mask : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { // 4 x long long store form
+  def movdqa64store256_mask : X86Builtin<"void(_Vector<4, long long int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512ifma,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 8 x long long: three same-typed vector operands, same-typed result
+  def vpmadd52huq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+  def vpmadd52luq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // NOTE(review): feature string mixes ',' and '|'; presumably (avx512ifma AND avx512vl) OR avxifma -- confirm operator precedence in clang's required-features evaluator
+  def vpmadd52huq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // 4 x long long form, same feature expression
+  def vpmadd52huq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // prototype identical to vpmadd52huq128
+  def vpmadd52luq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { // prototype identical to vpmadd52huq256
+  def vpmadd52luq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // scalar int result from two vectors plus two compile-time-constant ints
+  def vcomisd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>, _Constant int, _Constant int)">;
+  def vcomiss : X86Builtin<"int(_Vector<4, float>, _Vector<4, float>, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in { // no RequiredVectorWidth: operands and results are plain unsigned integers, not vectors
+  def kunpckdi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+  def kunpcksi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { // no Const attribute; the integer operand is as wide as the lane count (unsigned int for 32 lanes, unsigned long long for 64)
+  def loaddquhi512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, short const *>, _Vector<32, short>, unsigned int)">;
+  def loaddquqi512_mask : X86Builtin<"_Vector<64, char>(_Vector<64, char const *>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // _mask and _maskz defs share one prototype; the third operand is an integer vector with the same lane count as the float/double operands
+  def fixupimmpd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def fixupimmpd512_maskz : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def fixupimmps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, int>, _Constant int, unsigned short, _Constant int)">;
+  def fixupimmps512_maskz : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, int>, _Constant int, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit defs needing only avx512f; every prototype ends in a _Constant int
+  def fixupimmsd_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def fixupimmsd_maskz : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def fixupimmss_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, int>, _Constant int, unsigned char, _Constant int)">;
+  def fixupimmss_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, int>, _Constant int, unsigned char, _Constant int)">;
+  def getexpsd128_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def getexpss128_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def getmantsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char, _Constant int)">;
+  def getmantss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loaddquhi128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, short const *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loaddquhi256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, short const *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loaddquqi128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char const *>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loaddquqi256_mask : X86Builtin<"_Vector<32, char>(_Vector<32, char const *>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fixupimmpd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+  def fixupimmpd128_maskz : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def fixupimmpd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+  def fixupimmpd256_maskz : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fixupimmps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, int>, _Constant int, unsigned char)">;
+  def fixupimmps128_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def fixupimmps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, int>, _Constant int, unsigned char)">;
+  def fixupimmps256_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadapd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double const *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadsd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double const *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loadapd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double const *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadaps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float const *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadss128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float const *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loadaps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float const *>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loaddqudi128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int const *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loaddqudi256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int const *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loaddqusi128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int const *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loaddqusi256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int const *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadupd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double const *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loadupd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double const *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadups128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float const *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loadups256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float const *>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def storedquhi512_mask : X86Builtin<"void(_Vector<32, short *>, _Vector<32, short>, unsigned int)">;
+  def storedquqi512_mask : X86Builtin<"void(_Vector<64, char *>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storedquhi128_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storedquhi256_mask : X86Builtin<"void(_Vector<16, short *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storedquqi128_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storedquqi256_mask : X86Builtin<"void(_Vector<32, char *>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storeapd128_mask : X86Builtin<"void(_Vector<2, double *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storesd128_mask : X86Builtin<"void(_Vector<2, double *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storeapd256_mask : X86Builtin<"void(_Vector<4, double *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storeaps128_mask : X86Builtin<"void(_Vector<4, float *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storess128_mask : X86Builtin<"void(_Vector<4, float *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storeaps256_mask : X86Builtin<"void(_Vector<8, float *>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storedqudi128_mask : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storedqudi256_mask : X86Builtin<"void(_Vector<4, long long int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storedqusi128_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storedqusi256_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storeupd128_mask : X86Builtin<"void(_Vector<2, double *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storeupd256_mask : X86Builtin<"void(_Vector<4, double *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storeups128_mask : X86Builtin<"void(_Vector<4, float *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storeups256_mask : X86Builtin<"void(_Vector<8, float *>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcp14pd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rcp14pd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcp14ps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rcp14ps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vplzcntd_128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vplzcntd_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vplzcntq_128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vplzcntq_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtsd2si32 : X86Builtin<"int(_Vector<2, double>, _Constant int)">;
+  def vcvtsd2usi32 : X86Builtin<"unsigned int(_Vector<2, double>, _Constant int)">;
+  def vcvtss2si32 : X86Builtin<"int(_Vector<4, float>, _Constant int)">;
+  def vcvtss2usi32 : X86Builtin<"unsigned int(_Vector<4, float>, _Constant int)">;
+  def vcvttsd2si32 : X86Builtin<"int(_Vector<2, double>, _Constant int)">;
+  def vcvttsd2usi32 : X86Builtin<"unsigned int(_Vector<2, double>, _Constant int)">;
+  def vcvttss2si32 : X86Builtin<"int(_Vector<4, float>, _Constant int)">;
+  def vcvttss2usi32 : X86Builtin<"unsigned int(_Vector<4, float>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermilpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int)">;
+  def vpermilps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Constant int)">;
+  def vpermilvarpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>)">;
+  def vpermilvarps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rndscalesd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int, _Constant int)">;
+  def rndscaless_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def scalefpd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def scalefps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def scalefsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def scalefss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def psradi512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, int)">;
+  def psraqi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psraq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psraq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psraqi128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psraqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pslld512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>)">;
+  def psllq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>)">;
+  def psllv16si : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def psllv8di : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+  def psrad512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>)">;
+  def psraq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>)">;
+  def psrav16si : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def psrav8di : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+  def psrld512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>)">;
+  def psrlq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>)">;
+  def psrlv16si : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def psrlv8di : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+  def pternlogd512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">;
+  def pternlogd512_maskz : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">;
+  def pternlogq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">;
+  def pternlogq512_maskz : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pternlogd128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">;
+  def pternlogd128_maskz : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pternlogd256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">;
+  def pternlogd256_maskz : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pternlogq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+  def pternlogq128_maskz : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pternlogq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+  def pternlogq256_maskz : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def shuf_f32x4 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def shuf_f64x2 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def shuf_i32x4 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">;
+  def shuf_i64x2 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
+  def shufpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def shufps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def shuf_f32x4_256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def shuf_f64x2_256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def shuf_i32x4_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+  def shuf_i64x2_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def sqrtsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def sqrtss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rsqrt14pd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rsqrt14pd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rsqrt14ps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rsqrt14ps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cvtb2mask512 : X86Builtin<"unsigned long long int(_Vector<64, char>)">;
+  def cvtmask2b512 : X86Builtin<"_Vector<64, char>(unsigned long long int)">;
+  def cvtmask2w512 : X86Builtin<"_Vector<32, short>(unsigned int)">;
+}
+
+let Features = "avx512dq,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cvtd2mask512 : X86Builtin<"unsigned short(_Vector<16, int>)">;
+  def cvtmask2d512 : X86Builtin<"_Vector<16, int>(unsigned short)">;
+  def cvtmask2q512 : X86Builtin<"_Vector<8, long long int>(unsigned char)">;
+  def cvtq2mask512 : X86Builtin<"unsigned char(_Vector<8, long long int>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtb2mask128 : X86Builtin<"unsigned short(_Vector<16, char>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtb2mask256 : X86Builtin<"unsigned int(_Vector<32, char>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtmask2b128 : X86Builtin<"_Vector<16, char>(unsigned short)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtmask2b256 : X86Builtin<"_Vector<32, char>(unsigned int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtmask2w128 : X86Builtin<"_Vector<8, short>(unsigned char)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtmask2w256 : X86Builtin<"_Vector<16, short>(unsigned short)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtd2mask128 : X86Builtin<"unsigned char(_Vector<4, int>)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtd2mask256 : X86Builtin<"unsigned char(_Vector<8, int>)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtmask2d128 : X86Builtin<"_Vector<4, int>(unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtmask2d256 : X86Builtin<"_Vector<8, int>(unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtmask2q128 : X86Builtin<"_Vector<2, long long int>(unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtmask2q256 : X86Builtin<"_Vector<4, long long int>(unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtq2mask128 : X86Builtin<"unsigned char(_Vector<2, long long int>)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtq2mask256 : X86Builtin<"unsigned char(_Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovsdb512_mask : X86Builtin<"_Vector<16, char>(_Vector<16, int>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovsdb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovswb512mem_mask : X86Builtin<"void(_Vector<32, char *>, _Vector<32, short>, unsigned int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovsdw512_mask : X86Builtin<"_Vector<16, short>(_Vector<16, int>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovsdw512mem_mask : X86Builtin<"void(_Vector<16, short *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovsqb512_mask : X86Builtin<"_Vector<16, char>(_Vector<8, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovsqb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovsqd512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, long long int>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovsqd512mem_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovsqw512_mask : X86Builtin<"_Vector<8, short>(_Vector<8, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovsqw512mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovsdb128_mask : X86Builtin<"_Vector<16, char>(_Vector<4, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovsdb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovswb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovsdb256_mask : X86Builtin<"_Vector<16, char>(_Vector<8, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovsdb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovswb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovsdw128_mask : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovsdw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovsdw256_mask : X86Builtin<"_Vector<8, short>(_Vector<8, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovsdw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovsqb128_mask : X86Builtin<"_Vector<16, char>(_Vector<2, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovsqb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovsqb256_mask : X86Builtin<"_Vector<16, char>(_Vector<4, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovsqb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovsqd128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, long long int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovsqd128mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovsqd256_mask : X86Builtin<"_Vector<4, int>(_Vector<4, long long int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovsqd256mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovsqw128_mask : X86Builtin<"_Vector<8, short>(_Vector<2, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovsqw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovsqw256_mask : X86Builtin<"_Vector<8, short>(_Vector<4, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovsqw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovusdb512_mask : X86Builtin<"_Vector<16, char>(_Vector<16, int>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovusdb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovuswb512mem_mask : X86Builtin<"void(_Vector<32, char *>, _Vector<32, short>, unsigned int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovusdw512_mask : X86Builtin<"_Vector<16, short>(_Vector<16, int>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovusdw512mem_mask : X86Builtin<"void(_Vector<16, short *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovusqb512_mask : X86Builtin<"_Vector<16, char>(_Vector<8, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovusqb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovusqd512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, long long int>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovusqd512mem_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovusqw512_mask : X86Builtin<"_Vector<8, short>(_Vector<8, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovusqw512mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovusdb128_mask : X86Builtin<"_Vector<16, char>(_Vector<4, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovusdb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovuswb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovusdb256_mask : X86Builtin<"_Vector<16, char>(_Vector<8, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovusdb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovuswb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovusdw128_mask : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovusdw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovusdw256_mask : X86Builtin<"_Vector<8, short>(_Vector<8, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovusdw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovusqb128_mask : X86Builtin<"_Vector<16, char>(_Vector<2, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovusqb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovusqb256_mask : X86Builtin<"_Vector<16, char>(_Vector<4, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovusqb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovusqd128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, long long int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovusqd128mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovusqd256_mask : X86Builtin<"_Vector<4, int>(_Vector<4, long long int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovusqd256mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovusqw128_mask : X86Builtin<"_Vector<8, short>(_Vector<2, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovusqw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovusqw256_mask : X86Builtin<"_Vector<8, short>(_Vector<4, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovusqw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovdb512_mask : X86Builtin<"_Vector<16, char>(_Vector<16, int>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovdb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovwb512mem_mask : X86Builtin<"void(_Vector<32, char *>, _Vector<32, short>, unsigned int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovdw512_mask : X86Builtin<"_Vector<16, short>(_Vector<16, int>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovdw512mem_mask : X86Builtin<"void(_Vector<16, short *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovqb512_mask : X86Builtin<"_Vector<16, char>(_Vector<8, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovqb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovqd512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, long long int>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovqd512mem_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovqw512_mask : X86Builtin<"_Vector<8, short>(_Vector<8, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovqw512mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovdb128_mask : X86Builtin<"_Vector<16, char>(_Vector<4, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovwb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovdb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovdb256_mask : X86Builtin<"_Vector<16, char>(_Vector<8, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovdb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovwb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovdw128_mask : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovdw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovdw256_mask : X86Builtin<"_Vector<8, short>(_Vector<8, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovdw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovqb128_mask : X86Builtin<"_Vector<16, char>(_Vector<2, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovqb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovqb256_mask : X86Builtin<"_Vector<16, char>(_Vector<4, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovqb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovqd128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, long long int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovqd128mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovqd256mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovqw128_mask : X86Builtin<"_Vector<8, short>(_Vector<2, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovqw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovqw256_mask : X86Builtin<"_Vector<8, short>(_Vector<4, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovqw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512dq,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def extractf32x8_mask : X86Builtin<"_Vector<8, float>(_Vector<16, float>, _Constant int, _Vector<8, float>, unsigned char)">;
+  def extractf64x2_512_mask : X86Builtin<"_Vector<2, double>(_Vector<8, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+  def extracti32x8_mask : X86Builtin<"_Vector<8, int>(_Vector<16, int>, _Constant int, _Vector<8, int>, unsigned char)">;
+  def extracti64x2_512_mask : X86Builtin<"_Vector<2, long long int>(_Vector<8, long long int>, _Constant int, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def extracti32x4_mask : X86Builtin<"_Vector<4, int>(_Vector<16, int>, _Constant int, _Vector<4, int>, unsigned char)">;
+  def extracti64x4_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, long long int>, _Constant int, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def extractf64x2_256_mask : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+  def extracti64x2_256_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def extractf32x4_256_mask : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+  def extracti32x4_256_mask : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512dq,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def insertf32x8 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<8, float>, _Constant int)">;
+  def insertf64x2_512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<2, double>, _Constant int)">;
+  def inserti32x8 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<8, int>, _Constant int)">;
+  def inserti64x2_512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def insertf64x4 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<4, double>, _Constant int)">;
+  def inserti64x4 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def insertf64x2_256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
+  def inserti64x2_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def insertf32x4_256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
+  def inserti32x4_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def insertf32x4 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<4, float>, _Constant int)">;
+  def inserti32x4 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getmantpd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getmantpd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getmantps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getmantps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def getmantpd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int, _Vector<8, double>, unsigned char, _Constant int)">;
+  def getmantps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Constant int, _Vector<16, float>, unsigned short, _Constant int)">;
+  def getexppd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def getexpps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddss3_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddss3_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddss3_mask3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddsd3_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def vfmaddsd3_maskz : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def vfmaddsd3_mask3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def vfmsubsd3_mask3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def vfmsubss3_mask3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def permdf512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int)">;
+  def permdi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def permvarhi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def permvardf512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>)">;
+  def permvardi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+  def permvarsf512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>)">;
+  def permvarsi512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vbmi,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def permvarqi512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def permvarqi128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def permvarqi256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def permvarhi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def permvarhi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def permvardf256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>)">;
+  def permvardi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fpclasspd128_mask : X86Builtin<"unsigned char(_Vector<2, double>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def fpclasspd256_mask : X86Builtin<"unsigned char(_Vector<4, double>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fpclassps128_mask : X86Builtin<"unsigned char(_Vector<4, float>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def fpclassps256_mask : X86Builtin<"unsigned char(_Vector<8, float>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def fpclassps512_mask : X86Builtin<"unsigned short(_Vector<16, float>, _Constant int, unsigned short)">;
+  def fpclasspd512_mask : X86Builtin<"unsigned char(_Vector<8, double>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fpclasssd_mask : X86Builtin<"unsigned char(_Vector<2, double>, _Constant int, unsigned char)">;
+  def fpclassss_mask : X86Builtin<"unsigned char(_Vector<4, float>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kaddqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+  def kaddhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kaddsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kadddi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kandqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kandhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kandsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kanddi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kandnqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kandnhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kandnsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kandndi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def korqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def korhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def korsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kordi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kortestcqi : X86Builtin<"int(unsigned char, unsigned char)">;
+  def kortestzqi : X86Builtin<"int(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kortestchi : X86Builtin<"int(unsigned short, unsigned short)">;
+  def kortestzhi : X86Builtin<"int(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kortestcsi : X86Builtin<"int(unsigned int, unsigned int)">;
+  def kortestzsi : X86Builtin<"int(unsigned int, unsigned int)">;
+  def kortestcdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
+  def kortestzdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def ktestcqi : X86Builtin<"int(unsigned char, unsigned char)">;
+  def ktestzqi : X86Builtin<"int(unsigned char, unsigned char)">;
+  def ktestchi : X86Builtin<"int(unsigned short, unsigned short)">;
+  def ktestzhi : X86Builtin<"int(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def ktestcsi : X86Builtin<"int(unsigned int, unsigned int)">;
+  def ktestzsi : X86Builtin<"int(unsigned int, unsigned int)">;
+  def ktestcdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
+  def ktestzdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kunpckhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kxnorqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kxnorhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kxnorsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kxnordi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kxorqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kxorhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kxorsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kxordi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kshiftliqi : X86Builtin<"unsigned char(unsigned char, _Constant unsigned int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kshiftlihi : X86Builtin<"unsigned short(unsigned short, _Constant unsigned int)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kshiftlisi : X86Builtin<"unsigned int(unsigned int, _Constant unsigned int)">;
+  def kshiftlidi : X86Builtin<"unsigned long long int(unsigned long long int, _Constant unsigned int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kshiftriqi : X86Builtin<"unsigned char(unsigned char, _Constant unsigned int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kshiftrihi : X86Builtin<"unsigned short(unsigned short, _Constant unsigned int)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kshiftrisi : X86Builtin<"unsigned int(unsigned int, _Constant unsigned int)">;
+  def kshiftridi : X86Builtin<"unsigned long long int(unsigned long long int, _Constant unsigned int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kmovb : X86Builtin<"unsigned char(unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kmovw : X86Builtin<"unsigned short(unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kmovd : X86Builtin<"unsigned int(unsigned int)">;
+  def kmovq : X86Builtin<"unsigned long long int(unsigned long long int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def palignr512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def dbpsadbw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>, _Constant int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def dbpsadbw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def dbpsadbw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
+  def psadbw512 : X86Builtin<"_Vector<8, long long int>(_Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def compressdf512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, unsigned char)">;
+  def compressdi512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def compresshi512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, unsigned int)">;
+  def compressqi512_mask : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def compresssf512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, unsigned short)">;
+  def compresssi512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpsd_mask : X86Builtin<"unsigned char(_Vector<2, double>, _Vector<2, double>, _Constant int, unsigned char, _Constant int)">;
+  def cmpss_mask : X86Builtin<"unsigned char(_Vector<4, float>, _Vector<4, float>, _Constant int, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pshufd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int)">;
+  def expanddf512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, unsigned char)">;
+  def expanddi512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def expandhi512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, unsigned int)">;
+  def expandqi512_mask : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def expandloaddf512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double const *>, _Vector<8, double>, unsigned char)">;
+  def expandloaddi512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int const *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def expandloadhi512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, short const *>, _Vector<32, short>, unsigned int)">;
+  def expandloadqi512_mask : X86Builtin<"_Vector<64, char>(_Vector<64, char const *>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def expandloadsf512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float const *>, _Vector<16, float>, unsigned short)">;
+  def expandloadsi512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int const *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def expandsf512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, unsigned short)">;
+  def expandsi512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, unsigned short)">;
+  def cvtps2pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, float>, _Vector<8, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def compressstoredf512_mask : X86Builtin<"void(_Vector<8, double *>, _Vector<8, double>, unsigned char)">;
+  def compressstoredi512_mask : X86Builtin<"void(_Vector<8, long long int *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def compressstorehi512_mask : X86Builtin<"void(_Vector<32, short *>, _Vector<32, short>, unsigned int)">;
+  def compressstoreqi512_mask : X86Builtin<"void(_Vector<64, char *>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def compressstoresf512_mask : X86Builtin<"void(_Vector<16, float *>, _Vector<16, float>, unsigned short)">;
+  def compressstoresi512_mask : X86Builtin<"void(_Vector<16, int *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2ps_mask : X86Builtin<"_Vector<4, float>(_Vector<8, short>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2ps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, short>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtps2ph_mask : X86Builtin<"_Vector<8, short>(_Vector<4, float>, _Constant int, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtps2ph256_mask : X86Builtin<"_Vector<8, short>(_Vector<8, float>, _Constant int, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cvtw2mask512 : X86Builtin<"unsigned int(_Vector<32, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtw2mask128 : X86Builtin<"unsigned char(_Vector<8, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtw2mask256 : X86Builtin<"unsigned short(_Vector<16, short>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtsd2ss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<2, double>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def cvtsi2ss32 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, int, _Constant int)">;
+  def cvtss2sd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<4, float>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def cvtusi2ss32 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512vbmi,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpmultishiftqb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpmultishiftqb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpmultishiftqb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtne2ps2bf16_128 : X86Builtin<"_Vector<8, __bf16>(_Vector<4, float>, _Vector<4, float>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtne2ps2bf16_256 : X86Builtin<"_Vector<16, __bf16>(_Vector<8, float>, _Vector<8, float>)">;
+}
+
+let Features = "avx512bf16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cvtne2ps2bf16_512 : X86Builtin<"_Vector<32, __bf16>(_Vector<16, float>, _Vector<16, float>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtneps2bf16_128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<4, float>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtneps2bf16_256_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, float>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx512bf16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cvtneps2bf16_512_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, float>, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def dpbf16ps_128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def dpbf16ps_256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx512bf16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def dpbf16ps_512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx512bf16", Attributes = [NoThrow, Const] in {
+  def cvtsbf162ss_32 : X86Builtin<"float(__bf16)">;
+}
+
+let Features = "avx512vp2intersect,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vp2intersect_q_512 : X86Builtin<"void(_Vector<8, long long int>, _Vector<8, long long int>, unsigned char *, unsigned char *)">;
+}
+
+let Features = "avx512vp2intersect,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vp2intersect_q_256 : X86Builtin<"void(_Vector<4, long long int>, _Vector<4, long long int>, unsigned char *, unsigned char *)">;
+}
+
+let Features = "avx512vp2intersect,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vp2intersect_q_128 : X86Builtin<"void(_Vector<2, long long int>, _Vector<2, long long int>, unsigned char *, unsigned char *)">;
+}
+
+let Features = "avx512vp2intersect,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vp2intersect_d_512 : X86Builtin<"void(_Vector<16, int>, _Vector<16, int>, unsigned short *, unsigned short *)">;
+}
+
+let Features = "avx512vp2intersect,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vp2intersect_d_256 : X86Builtin<"void(_Vector<8, int>, _Vector<8, int>, unsigned char *, unsigned char *)">;
+}
+
+let Features = "avx512vp2intersect,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vp2intersect_d_128 : X86Builtin<"void(_Vector<4, int>, _Vector<4, int>, unsigned char *, unsigned char *)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcomish : X86Builtin<"int(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def addph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+  def subph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+  def mulph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+  def divph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+  def maxph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+  def minph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def minph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def minph128 : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def maxph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def maxph128 : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def addsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def divsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def mulsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def subsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def maxsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def minsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cmpph512_mask : X86Builtin<"unsigned int(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cmpph256_mask : X86Builtin<"unsigned short(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpph128_mask : X86Builtin<"unsigned char(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpsh_mask : X86Builtin<"unsigned char(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadsh128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16 const *>, _Vector<8, _Float16>, unsigned char)">;
+  def storesh128_mask : X86Builtin<"void(_Vector<8, _Float16 *>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcpph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rcpph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def rcpph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rsqrtph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rsqrtph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def rsqrtph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getmantph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getmantph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def getmantph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getexpph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getexpph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def getexpph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def scalefph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def scalefph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def scalefph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rndscaleph_128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rndscaleph_256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def rndscaleph_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduceph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduceph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduceph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcpsh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+  def rsqrtsh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+  def getmantsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def getexpsh128_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def scalefsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def rndscalesh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int, _Constant int)">;
+  def reducesh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def sqrtph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def sqrtph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def sqrtph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def sqrtsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fpclassph128_mask : X86Builtin<"unsigned char(_Vector<8, _Float16>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def fpclassph256_mask : X86Builtin<"unsigned short(_Vector<16, _Float16>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def fpclassph512_mask : X86Builtin<"unsigned int(_Vector<32, _Float16>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fpclasssh_mask : X86Builtin<"unsigned char(_Vector<8, _Float16>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtpd2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<2, double>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtpd2ph256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, double>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtpd2ph512_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, double>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2pd128_mask : X86Builtin<"_Vector<2, double>(_Vector<8, _Float16>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2pd256_mask : X86Builtin<"_Vector<4, double>(_Vector<8, _Float16>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, _Float16>, _Vector<8, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtsh2ss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<8, _Float16>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vcvtss2sh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<4, float>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtsd2sh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<2, double>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtsh2sd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<8, _Float16>, _Vector<2, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2w128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, _Float16>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2w256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, _Float16>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2w512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, _Float16>, _Vector<32, short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttph2w128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, _Float16>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2w256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, _Float16>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvttph2w512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, _Float16>, _Vector<32, short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtw2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, short>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtw2ph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, short>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtw2ph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, short>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2uw128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2uw256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2uw512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttph2uw128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2uw256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvttph2uw512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtuw2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, unsigned short>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtuw2ph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, unsigned short>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtuw2ph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, unsigned short>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2dq128_mask : X86Builtin<"_Vector<4, int>(_Vector<8, _Float16>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2dq256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, _Float16>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2dq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, _Float16>, _Vector<16, int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2udq128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<8, _Float16>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2udq256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, _Float16>, _Vector<8, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2udq512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, _Float16>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtdq2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtdq2ph256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtdq2ph512_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, int>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtudq2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, unsigned int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtudq2ph256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, unsigned int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtudq2ph512_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, unsigned int>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttph2dq128_mask : X86Builtin<"_Vector<4, int>(_Vector<8, _Float16>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2dq256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, _Float16>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvttph2dq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, _Float16>, _Vector<16, int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttph2udq128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<8, _Float16>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2udq256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, _Float16>, _Vector<8, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvttph2udq512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, _Float16>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vcvtqq2ph family: long long int vector in (2/4/8 elements); the result is 8 x _Float16 at every width.
+  def vcvtqq2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<2, long long int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtqq2ph256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, long long int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // Only the 512-bit def requires evex512 and takes a trailing _Constant int.
+  def vcvtqq2ph512_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, long long int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vcvtph2qq family: the source is 8 x _Float16 at every width; the result is 2/4/8 x long long int.
+  def vcvtph2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<8, _Float16>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, _Float16>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // Only the 512-bit def requires evex512 and takes a trailing _Constant int.
+  def vcvtph2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, _Float16>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vcvtuqq2ph family: unsigned long long int vector in (2/4/8 elements); the result is 8 x _Float16 at every width.
+  def vcvtuqq2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<2, unsigned long long int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtuqq2ph256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, unsigned long long int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // Only the 512-bit def requires evex512 and takes a trailing _Constant int.
+  def vcvtuqq2ph512_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, unsigned long long int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vcvtph2uqq family: the source is 8 x _Float16 at every width; the result is 2/4/8 x unsigned long long int.
+  def vcvtph2uqq128_mask : X86Builtin<"_Vector<2, unsigned long long int>(_Vector<8, _Float16>, _Vector<2, unsigned long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2uqq256_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<8, _Float16>, _Vector<4, unsigned long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // Only the 512-bit def requires evex512 and takes a trailing _Constant int.
+  def vcvtph2uqq512_mask : X86Builtin<"_Vector<8, unsigned long long int>(_Vector<8, _Float16>, _Vector<8, unsigned long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vcvttph2qq family: the source is 8 x _Float16 at every width; the result is 2/4/8 x long long int.
+  def vcvttph2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<8, _Float16>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, _Float16>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // Only the 512-bit def requires evex512 and takes a trailing _Constant int.
+  def vcvttph2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, _Float16>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vcvttph2uqq family: the source is 8 x _Float16 at every width; the result is 2/4/8 x unsigned long long int.
+  def vcvttph2uqq128_mask : X86Builtin<"_Vector<2, unsigned long long int>(_Vector<8, _Float16>, _Vector<2, unsigned long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2uqq256_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<8, _Float16>, _Vector<4, unsigned long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // Only the 512-bit def requires evex512 and takes a trailing _Constant int.
+  def vcvttph2uqq512_mask : X86Builtin<"_Vector<8, unsigned long long int>(_Vector<8, _Float16>, _Vector<8, unsigned long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // Conversions between an 8 x _Float16 vector operand and a 32-bit int / unsigned int; needs only avx512fp16, and every def ends with a _Constant int operand.
+  def vcvtsh2si32 : X86Builtin<"int(_Vector<8, _Float16>, _Constant int)">;
+  def vcvtsh2usi32 : X86Builtin<"unsigned int(_Vector<8, _Float16>, _Constant int)">;
+  def vcvtusi2sh : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, unsigned int, _Constant int)">; // integer -> vector direction: takes an existing vector plus the integer
+  def vcvtsi2sh : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, int, _Constant int)">;
+  def vcvttsh2si32 : X86Builtin<"int(_Vector<8, _Float16>, _Constant int)">;
+  def vcvttsh2usi32 : X86Builtin<"unsigned int(_Vector<8, _Float16>, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vcvtph2psx family: _Float16 vector in, float vector out (4/8/16 result elements); the 128- and 256-bit forms both take 8 x _Float16.
+  def vcvtph2psx128_mask : X86Builtin<"_Vector<4, float>(_Vector<8, _Float16>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2psx256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, _Float16>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // Only the 512-bit def requires evex512 and takes a trailing _Constant int.
+  def vcvtph2psx512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, _Float16>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vcvtps2phx family: float vector in (4/8/16 elements), _Float16 vector out; the 128- and 256-bit forms both return 8 x _Float16.
+  def vcvtps2phx128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, float>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtps2phx256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, float>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // Only the 512-bit def requires evex512 and takes a trailing _Constant int.
+  def vcvtps2phx512_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, float>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vfmaddph family: the 128- and 256-bit defs are plain three-vector builtins with no mask operand.
+  def vfmaddph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmaddph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // The 512-bit variants (_mask/_mask3/_maskz) share one signature: three vectors, an unsigned int, and a _Constant int.
+  def vfmaddph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmaddph512_mask3 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmaddph512_maskz : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vfmaddsubph family: the 128- and 256-bit defs are plain three-vector builtins with no mask operand.
+  def vfmaddsubph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmaddsubph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // All five 512-bit defs share one signature; this group also holds vfmsubaddph512_mask3 and vfmsubph512_mask3.
+  def vfmaddsubph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmaddsubph512_maskz : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmaddsubph512_mask3 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmsubaddph512_mask3 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmsubph512_mask3 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vfmaddsh3/vfmsubsh3 variants: needs only avx512fp16; all four defs share one signature (three 8 x _Float16 vectors, unsigned char, _Constant int).
+  def vfmaddsh3_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vfmaddsh3_maskz : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vfmaddsh3_mask3 : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vfmsubsh3_mask3 : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vfmaddcph family: an avx512fp16 builtin whose operands are declared as float vectors (presumably one complex _Float16 pair per float lane - confirm).
+  def vfmaddcph128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+  def vfmaddcph128_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmaddcph256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+  def vfmaddcph256_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // Only the 512-bit group has a _mask3 variant and a trailing _Constant int operand.
+  def vfmaddcph512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddcph512_maskz : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddcph512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vfcmaddcph family: an avx512fp16 builtin whose operands are declared as float vectors (presumably one complex _Float16 pair per float lane - confirm).
+  def vfcmaddcph128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+  def vfcmaddcph128_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfcmaddcph256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+  def vfcmaddcph256_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // Only the 512-bit group has a _mask3 variant and a trailing _Constant int operand.
+  def vfcmaddcph512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfcmaddcph512_maskz : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfcmaddcph512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vf[c]maddcsh / vf[c]mulcsh variants: needs only avx512fp16; all ten defs share one signature (three 4 x float vectors, unsigned char, _Constant int).
+  def vfmaddcsh_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddcsh_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfcmaddcsh_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfcmaddcsh_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddcsh_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddcsh_round_mask3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfcmaddcsh_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfcmaddcsh_round_mask3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmulcsh_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfcmulcsh_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vfmulcph family: three float vectors (4/8/16 elements) plus an integer operand; declared on float vectors although the feature is avx512fp16.
+  def vfmulcph128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmulcph256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // Only the 512-bit def requires evex512 and takes a trailing _Constant int.
+  def vfmulcph512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vfcmulcph family: three float vectors (4/8/16 elements) plus an integer operand; declared on float vectors although the feature is avx512fp16.
+  def vfcmulcph128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfcmulcph256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // Only the 512-bit def requires evex512 and takes a trailing _Constant int.
+  def vfcmulcph512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // selectb family: integer first operand followed by two char vectors of the result type; the integer widens with the element count (unsigned short / int / long long for 16 / 32 / 64 elements).
+  def selectb_128 : X86Builtin<"_Vector<16, char>(unsigned short, _Vector<16, char>, _Vector<16, char>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectb_256 : X86Builtin<"_Vector<32, char>(unsigned int, _Vector<32, char>, _Vector<32, char>)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectb_512 : X86Builtin<"_Vector<64, char>(unsigned long long int, _Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // selectw family: integer first operand followed by two short vectors of the result type (unsigned char / short / int for 8 / 16 / 32 elements).
+  def selectw_128 : X86Builtin<"_Vector<8, short>(unsigned char, _Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectw_256 : X86Builtin<"_Vector<16, short>(unsigned short, _Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectw_512 : X86Builtin<"_Vector<32, short>(unsigned int, _Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // selectd family: the 128/256-bit forms need only avx512vl and take an unsigned char first operand for both 4 and 8 elements.
+  def selectd_128 : X86Builtin<"_Vector<4, int>(unsigned char, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectd_256 : X86Builtin<"_Vector<8, int>(unsigned char, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // The 512-bit form switches to avx512f,evex512 and an unsigned short first operand (16 elements).
+  def selectd_512 : X86Builtin<"_Vector<16, int>(unsigned short, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // selectph family: integer first operand followed by two _Float16 vectors of the result type (unsigned char / short / int for 8 / 16 / 32 elements).
+  def selectph_128 : X86Builtin<"_Vector<8, _Float16>(unsigned char, _Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectph_256 : X86Builtin<"_Vector<16, _Float16>(unsigned short, _Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectph_512 : X86Builtin<"_Vector<32, _Float16>(unsigned int, _Vector<32, _Float16>, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // selectpbf family: integer first operand followed by two __bf16 vectors of the result type (unsigned char / short / int for 8 / 16 / 32 elements); gated on avx512bf16.
+  def selectpbf_128 : X86Builtin<"_Vector<8, __bf16>(unsigned char, _Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectpbf_256 : X86Builtin<"_Vector<16, __bf16>(unsigned short, _Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx512bf16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectpbf_512 : X86Builtin<"_Vector<32, __bf16>(unsigned int, _Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // selectq family: unsigned char first operand at every width (2 / 4 / 8 long long elements); the 128/256-bit forms need only avx512vl.
+  def selectq_128 : X86Builtin<"_Vector<2, long long int>(unsigned char, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectq_256 : X86Builtin<"_Vector<4, long long int>(unsigned char, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // The 512-bit form switches to avx512f,evex512.
+  def selectq_512 : X86Builtin<"_Vector<8, long long int>(unsigned char, _Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // selectps family: integer first operand followed by two float vectors of the result type; unsigned char for 4 and 8 elements.
+  def selectps_128 : X86Builtin<"_Vector<4, float>(unsigned char, _Vector<4, float>, _Vector<4, float>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectps_256 : X86Builtin<"_Vector<8, float>(unsigned char, _Vector<8, float>, _Vector<8, float>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // The 512-bit form switches to avx512f,evex512 and an unsigned short first operand (16 elements).
+  def selectps_512 : X86Builtin<"_Vector<16, float>(unsigned short, _Vector<16, float>, _Vector<16, float>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // selectpd family: unsigned char first operand at every width (2 / 4 / 8 double elements); the 128/256-bit forms need only avx512vl.
+  def selectpd_128 : X86Builtin<"_Vector<2, double>(unsigned char, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectpd_256 : X86Builtin<"_Vector<4, double>(unsigned char, _Vector<4, double>, _Vector<4, double>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // The 512-bit form switches to avx512f,evex512.
+  def selectpd_512 : X86Builtin<"_Vector<8, double>(unsigned char, _Vector<8, double>, _Vector<8, double>)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // Needs only avx512fp16 (no avx512vl); the signature is identical to selectph_128 (presumably the scalar/low-element variant, per the "s" in the name - confirm).
+  def selectsh_128 : X86Builtin<"_Vector<8, _Float16>(unsigned char, _Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512bf16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // Needs only avx512bf16 (no avx512vl); the signature is identical to selectpbf_128 (presumably the scalar/low-element variant - confirm).
+  def selectsbf_128 : X86Builtin<"_Vector<8, __bf16>(unsigned char, _Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // 128-bit float/double selects gated on avx512f alone; signatures match selectps_128 / selectpd_128 (presumably scalar/low-element variants - confirm).
+  def selectss_128 : X86Builtin<"_Vector<4, float>(unsigned char, _Vector<4, float>, _Vector<4, float>)">;
+  def selectsd_128 : X86Builtin<"_Vector<2, double>(unsigned char, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // reduce_fadd family: returns a scalar of the element type; takes a scalar first operand (presumably the starting accumulator - confirm) and the vector to reduce.
+  def reduce_fadd_pd512 : X86Builtin<"double(double, _Vector<8, double>)">;
+  def reduce_fadd_ps512 : X86Builtin<"float(float, _Vector<16, float>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // _Float16 variants exist at 512, 256 and 128 bits; pd/ps only at 512.
+  def reduce_fadd_ph512 : X86Builtin<"_Float16(_Float16, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduce_fadd_ph256 : X86Builtin<"_Float16(_Float16, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduce_fadd_ph128 : X86Builtin<"_Float16(_Float16, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // reduce_fmax family: returns a scalar of the element type from a single vector operand (no scalar operand, unlike reduce_fadd/reduce_fmul).
+  def reduce_fmax_pd512 : X86Builtin<"double(_Vector<8, double>)">;
+  def reduce_fmax_ps512 : X86Builtin<"float(_Vector<16, float>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // _Float16 variants exist at 512, 256 and 128 bits; pd/ps only at 512.
+  def reduce_fmax_ph512 : X86Builtin<"_Float16(_Vector<32, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduce_fmax_ph256 : X86Builtin<"_Float16(_Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduce_fmax_ph128 : X86Builtin<"_Float16(_Vector<8, _Float16>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // reduce_fmin family: returns a scalar of the element type from a single vector operand (no scalar operand, unlike reduce_fadd/reduce_fmul).
+  def reduce_fmin_pd512 : X86Builtin<"double(_Vector<8, double>)">;
+  def reduce_fmin_ps512 : X86Builtin<"float(_Vector<16, float>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // _Float16 variants exist at 512, 256 and 128 bits; pd/ps only at 512.
+  def reduce_fmin_ph512 : X86Builtin<"_Float16(_Vector<32, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduce_fmin_ph256 : X86Builtin<"_Float16(_Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduce_fmin_ph128 : X86Builtin<"_Float16(_Vector<8, _Float16>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // reduce_fmul family: returns a scalar of the element type; takes a scalar first operand (presumably the starting accumulator - confirm) and the vector to reduce.
+  def reduce_fmul_pd512 : X86Builtin<"double(double, _Vector<8, double>)">;
+  def reduce_fmul_ps512 : X86Builtin<"float(float, _Vector<16, float>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // _Float16 variants exist at 512, 256 and 128 bits; pd/ps only at 512.
+  def reduce_fmul_ph512 : X86Builtin<"_Float16(_Float16, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduce_fmul_ph256 : X86Builtin<"_Float16(_Float16, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduce_fmul_ph128 : X86Builtin<"_Float16(_Float16, _Vector<8, _Float16>)">;
+}
+
+let Features = "mwaitx", Attributes = [NoThrow] in { // mwaitx-gated builtins; NoThrow only (no Const), both return void.
+  def monitorx : X86Builtin<"void(void const *, unsigned int, unsigned int)">;
+  def mwaitx : X86Builtin<"void(unsigned int, unsigned int, unsigned int)">;
+}
+
+let Features = "waitpkg", Attributes = [NoThrow] in { // waitpkg-gated builtins; NoThrow only. umwait and tpause share one signature and return unsigned char.
+  def umonitor : X86Builtin<"void(void const *)">;
+  def umwait : X86Builtin<"unsigned char(unsigned int, unsigned int, unsigned int)">;
+  def tpause : X86Builtin<"unsigned char(unsigned int, unsigned int, unsigned int)">;
+}
+
+let Features = "clzero", Attributes = [NoThrow] in { // clzero-gated builtin taking a non-const void pointer; NoThrow only.
+  def clzero : X86Builtin<"void(void *)">;
+}
+
+let Features = "cldemote", Attributes = [NoThrow] in { // cldemote-gated builtin taking a const void pointer; NoThrow only.
+  def cldemote : X86Builtin<"void(void const *)">;
+}
+
+let Features = "movdiri", Attributes = [NoThrow] in { // movdiri-gated builtin: (destination pointer, 32-bit value); NoThrow only.
+  def directstore_u32 : X86Builtin<"void(unsigned int *, unsigned int)">;
+}
+
+let Features = "movdir64b", Attributes = [NoThrow] in { // movdir64b-gated builtin: (non-const pointer, const pointer); NoThrow only.
+  def movdir64b : X86Builtin<"void(void *, void const *)">;
+}
+
+let Features = "ptwrite", Attributes = [NoThrow] in { // ptwrite-gated builtin taking one 32-bit value; NoThrow only.
+  def ptwrite32 : X86Builtin<"void(unsigned int)">;
+}
+
+let Features = "invpcid", Attributes = [NoThrow, Const] in { // NOTE(review): marked Const although the builtin returns void and takes a non-const void *; the neighbouring void builtins are NoThrow only - confirm Const is intended.
+  def invpcid : X86Builtin<"void(unsigned int, void *)">;
+}
+
+let Features = "enqcmd", Attributes = [NoThrow] in { // enqcmd-gated builtins; both share one signature and return unsigned char (presumably a status flag - confirm).
+  def enqcmd : X86Builtin<"unsigned char(void *, void const *)">;
+  def enqcmds : X86Builtin<"unsigned char(void *, void const *)">;
+}
+
+let Features = "kl", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // kl-gated (Key Locker) builtins; NoThrow without Const.
+  def loadiwkey : X86Builtin<"void(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>, unsigned int)">;
+  def encodekey128_u32 : X86Builtin<"unsigned int(unsigned int, _Vector<2, long long int>, void *)">;
+  def encodekey256_u32 : X86Builtin<"unsigned int(unsigned int, _Vector<2, long long int>, _Vector<2, long long int>, void *)">;
+  def aesenc128kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int>, void const *)">; // NOTE(review): in the four aes*kl_u8 defs the '*' sits inside the angle brackets of the first parameter - presumably meant as a pointer to a 2 x long long vector; confirm how the prototype parser reads it.
+  def aesenc256kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int>, void const *)">;
+  def aesdec128kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int>, void const *)">;
+  def aesdec256kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int>, void const *)">;
+}
+
+let Features = "kl,widekl", Attributes = [NoThrow, RequiredVectorWidth<128>] in { // Wide Key Locker builtins (need kl and widekl); all four share one signature. NOTE(review): the '*' / 'const *' sit inside the _Vector<...> brackets - presumably pointers to 2 x long long vectors; confirm how the prototype parser reads them.
+  def aesencwide128kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int const *>, void const *)">;
+  def aesencwide256kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int const *>, void const *)">;
+  def aesdecwide128kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int const *>, void const *)">;
+  def aesdecwide256kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int const *>, void const *)">;
+}
+
+let Features = "serialize", Attributes = [NoThrow] in { // serialize-gated builtin with no operands and no result; NoThrow only.
+  def serialize : X86Builtin<"void()">;
+}
+
+let Features = "tsxldtrk", Attributes = [NoThrow] in { // tsxldtrk-gated pair of builtins with no operands and no result; NoThrow only.
+  def xsusldtrk : X86Builtin<"void()">;
+  def xresldtrk : X86Builtin<"void()">;
+}
+
+let Features = "raoint", Attributes = [NoThrow] in { // raoint-gated builtins; all four share one signature: (void * target, signed int value), returning void.
+  def aadd32 : X86Builtin<"void(void *, signed int)">;
+  def aand32 : X86Builtin<"void(void *, signed int)">;
+  def aor32 : X86Builtin<"void(void *, signed int)">;
+  def axor32 : X86Builtin<"void(void *, signed int)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in { // Library builtins (X86LibBuiltin, not X86Builtin) tied to intrin.h for ALL_MS_LANGUAGES; no Features gate is set in this group.
+  def _BitScanForward : X86LibBuiltin<"unsigned char(msuint32_t *, msuint32_t)">;
+  def _BitScanReverse : X86LibBuiltin<"unsigned char(msuint32_t *, msuint32_t)">;
+  def _ReadWriteBarrier : X86LibBuiltin<"void()">;
+  def _ReadBarrier : X86LibBuiltin<"void()">;
+  def _WriteBarrier : X86LibBuiltin<"void()">;
+  def __cpuid : X86LibBuiltin<"void(int *, int)">;
+  def __cpuidex : X86LibBuiltin<"void(int *, int, int)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, Const, RequireDeclaration] in { // intrin.h library builtins that additionally carry Const: two 32-bit operands in, one 64-bit result out.
+  def __emul : X86LibBuiltin<"long long int(int, int)">;
+  def __emulu : X86LibBuiltin<"unsigned long long int(unsigned int, unsigned int)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in { // intrin.h library builtins with NoThrow and RequireDeclaration only (no Const).
+  def _AddressOfReturnAddress : X86LibBuiltin<"void *()">;
+  def __stosb : X86LibBuiltin<"void(unsigned char *, unsigned char, size_t)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration, NoReturn] in { // intrin.h library builtins additionally marked NoReturn.
+  def __int2c : X86LibBuiltin<"void()">;
+  def __ud2 : X86LibBuiltin<"void()">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in { // __readfs*/__readgs* library builtins: each takes one msuint32_t (presumably a segment-relative offset - confirm) and returns an 8/16/32/64-bit unsigned value.
+  def __readfsbyte : X86LibBuiltin<"unsigned char(msuint32_t)">;
+  def __readfsword : X86LibBuiltin<"unsigned short(msuint32_t)">;
+  def __readfsdword : X86LibBuiltin<"msuint32_t(msuint32_t)">;
+  def __readfsqword : X86LibBuiltin<"unsigned long long int(msuint32_t)">;
+  def __readgsbyte : X86LibBuiltin<"unsigned char(msuint32_t)">;
+  def __readgsword : X86LibBuiltin<"unsigned short(msuint32_t)">;
+  def __readgsdword : X86LibBuiltin<"msuint32_t(msuint32_t)">;
+  def __readgsqword : X86LibBuiltin<"unsigned long long int(msuint32_t)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { // vdpphps, 128/256-bit forms (avx10.2-256): a float vector of the result type plus two _Float16 vectors with twice as many elements.
+  def vdpphps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vdpphps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // avx10.2-512 group: the 512-bit vdpphps plus six vpdpb* builtins that all take and return 16 x int vectors.
+  def vdpphps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<32, _Float16>, _Vector<32, _Float16>)">;
+  def vpdpbssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpbssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpbsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpbsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpbuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpbuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { // NOTE(review): these vpdpw* defs omit Const, whereas the vpdpb* defs with the same 16 x int signature and the same feature carry it - confirm the difference is intentional.
+  def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { // 512-bit mpsadbw: two 64 x char vectors and a _Constant char immediate in, 32 x short out.
+  def mpsadbw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vaddpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vaddph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vaddps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vcmppd256_round_mask : X86Builtin<"unsigned char(_Vector<4, double>, _Vector<4, double>, _Constant int, unsigned char, _Constant int)">;
+  def vcmpph256_round_mask : X86Builtin<"unsigned short(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int, unsigned short, _Constant int)">;
+  def vcmpps256_round_mask : X86Builtin<"unsigned char(_Vector<8, float>, _Vector<8, float>, _Constant int, unsigned char, _Constant int)">;
+  def vcvtdq2ph256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtdq2ps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, int>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vcvtpd2dq256_round_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char, _Constant int)">;
+  def vcvtpd2ph256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, double>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtpd2ps256_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, double>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vcvtpd2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvtpd2udq256_round_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, double>, _Vector<4, unsigned int>, unsigned char, _Constant int)">;
+  def vcvtpd2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, double>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvtph2dq256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, _Float16>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def vcvtph2pd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<8, _Float16>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vcvtph2psx256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, _Float16>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vcvtph2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, _Float16>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvtph2udq256_round_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, _Float16>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+  def vcvtph2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<8, _Float16>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvtph2uw256_round_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+  def vcvtph2w256_round_mask : X86Builtin<"_Vector<16, short>(_Vector<16, _Float16>, _Vector<16, short>, unsigned short, _Constant int)">;
+  def vcvtps2dq256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def vcvtps2pd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, float>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vcvtps2phx256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, float>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtps2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvtps2udq256_round_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+  def vcvtps2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, float>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvtqq2pd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, long long int>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vcvtqq2ph256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, long long int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtqq2ps256_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, long long int>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vcvttpd2dq256_round_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char, _Constant int)">;
+  def vcvttpd2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvttpd2udq256_round_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, double>, _Vector<4, unsigned int>, unsigned char, _Constant int)">;
+  def vcvttpd2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, double>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvttph2dq256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, _Float16>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def vcvttph2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, _Float16>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvttph2udq256_round_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, _Float16>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+  def vcvttph2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<8, _Float16>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvttph2uw256_round_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+  def vcvttph2w256_round_mask : X86Builtin<"_Vector<16, short>(_Vector<16, _Float16>, _Vector<16, short>, unsigned short, _Constant int)">;
+  def vcvttps2dq256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def vcvttps2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvttps2udq256_round_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+  def vcvttps2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, float>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvtudq2ph256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, unsigned int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtudq2ps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, unsigned int>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vcvtuqq2pd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, unsigned long long int>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vcvtuqq2ph256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, unsigned long long int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtuqq2ps256_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, unsigned long long int>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vcvtuw2ph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, unsigned short>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vcvtw2ph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, short>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vdivpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vdivph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vdivps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vfcmaddcph256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfcmaddcph256_round_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfcmaddcph256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfcmulcph256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfixupimmpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def vfixupimmpd256_round_maskz : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def vfixupimmps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, int>, _Constant int, unsigned char, _Constant int)">;
+  def vfixupimmps256_round_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, int>, _Constant int, unsigned char, _Constant int)">;
+  def vfmaddpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddpd256_round_maskz : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddpd256_round_mask3 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddph256_round_maskz : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddph256_round_mask3 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddps256_round_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddps256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddcph256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddcph256_round_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddcph256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddsubpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddsubpd256_round_maskz : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddsubpd256_round_mask3 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddsubph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddsubph256_round_maskz : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddsubph256_round_mask3 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddsubps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddsubps256_round_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddsubps256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmsubpd256_round_mask3 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmsubph256_round_mask3 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmsubps256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmsubaddpd256_round_mask3 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmsubaddph256_round_mask3 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmsubaddps256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmulcph256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vgetexppd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vgetexpph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vgetexpps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vgetmantpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vgetmantph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vgetmantps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vmaxpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vmaxph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vmaxps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vminpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vminph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vminps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vmulpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vmulph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vmulps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vrangepd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vrangeps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vreducepd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vreduceph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vreduceps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vrndscalepd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vrndscaleph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vrndscaleps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vscalefpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vscalefph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vscalefps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vsqrtpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
+  def vsqrtph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int)">;
+  def vsqrtps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
+  def vsubpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vsubph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vsubps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttsd2sis32 : X86Builtin<"int(_Vector<2, double>, _Constant int)">;
+  def vcvttsd2usis32 : X86Builtin<"unsigned int(_Vector<2, double>, _Constant int)">;
+  def vcvttss2sis32 : X86Builtin<"int(_Vector<4, float>, _Constant int)">;
+  def vcvttss2usis32 : X86Builtin<"unsigned int(_Vector<4, float>, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttpd2dqs128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttpd2dqs256_round_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttpd2dqs512_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttpd2udqs128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttpd2udqs256_round_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttpd2udqs512_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttpd2qqs128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttpd2qqs256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttpd2qqs512_round_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttpd2uqqs128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttpd2uqqs256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttpd2uqqs512_round_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2dqs128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, float>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2dqs256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2dqs512_round_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2udqs128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, float>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2udqs256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2udqs512_round_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2qqs128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2qqs256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2qqs512_round_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2uqqs128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2uqqs256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2uqqs512_round_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vbcstnebf162ps128 : X86Builtin<"_Vector<4, float>(__bf16 const *)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vbcstnebf162ps256 : X86Builtin<"_Vector<8, float>(__bf16 const *)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vbcstnesh2ps128 : X86Builtin<"_Vector<4, float>(_Float16 const *)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vbcstnesh2ps256 : X86Builtin<"_Vector<8, float>(_Float16 const *)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneebf162ps128 : X86Builtin<"_Vector<4, float>(_Vector<8, __bf16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneebf162ps256 : X86Builtin<"_Vector<8, float>(_Vector<16, __bf16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneeph2ps128 : X86Builtin<"_Vector<4, float>(_Vector<8, _Float16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneeph2ps256 : X86Builtin<"_Vector<8, float>(_Vector<16, _Float16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneobf162ps128 : X86Builtin<"_Vector<4, float>(_Vector<8, __bf16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneobf162ps256 : X86Builtin<"_Vector<8, float>(_Vector<16, __bf16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneoph2ps128 : X86Builtin<"_Vector<4, float>(_Vector<8, _Float16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneoph2ps256 : X86Builtin<"_Vector<8, float>(_Vector<16, _Float16 const *>)">;
+}
+
+let Features = "avx512bf16,avx512vl|avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneps2bf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<4, float>)">;
+}
+
+let Features = "avx512bf16,avx512vl|avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneps2bf16256 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, float>)">;
+}
+
+let Features = "sha512", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vsha512msg1 : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, unsigned long long int>, _Vector<2, unsigned long long int>)">;
+  def vsha512msg2 : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, unsigned long long int>, _Vector<4, unsigned long long int>)">;
+  def vsha512rnds2 : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, unsigned long long int>, _Vector<4, unsigned long long int>, _Vector<2, unsigned long long int>)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in {
+  def _InterlockedAnd64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+  def _InterlockedDecrement64 : X86LibBuiltin<"int64_t(int64_t volatile *)">;
+  def _InterlockedExchange64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+  def _InterlockedExchangeAdd64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+  def _InterlockedExchangeSub64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+  def _InterlockedIncrement64 : X86LibBuiltin<"int64_t(int64_t volatile *)">;
+  def _InterlockedOr64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+  def _InterlockedXor64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+}
+
+let Features = "sm3", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vsm3msg1 : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, unsigned int>, _Vector<4, unsigned int>, _Vector<4, unsigned int>)">;
+  def vsm3msg2 : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, unsigned int>, _Vector<4, unsigned int>, _Vector<4, unsigned int>)">;
+  def vsm3rnds2 : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, unsigned int>, _Vector<4, unsigned int>, _Vector<4, unsigned int>, _Constant unsigned int)">;
+}
+
+let Features = "sm4", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vsm4key4128 : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, unsigned int>, _Vector<4, unsigned int>)">;
+}
+
+let Features = "sm4", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vsm4key4256 : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, unsigned int>, _Vector<8, unsigned int>)">;
+}
+
+let Features = "sm4", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vsm4rnds4128 : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, unsigned int>, _Vector<4, unsigned int>)">;
+}
+
+let Features = "sm4", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vsm4rnds4256 : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, unsigned int>, _Vector<8, unsigned int>)">;
+}
+
+let Features = "avx10.2-512,sm4", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vsm4key4512 : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, unsigned int>, _Vector<16, unsigned int>)">;
+  def vsm4rnds4512 : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, unsigned int>, _Vector<16, unsigned int>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vminmaxnepbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vminmaxnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vminmaxnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vminmaxpd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vminmaxpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vminmaxpd512_round_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int, _Vector<8, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vminmaxph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vminmaxph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vminmaxph512_round_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vminmaxps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vminmaxps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vminmaxps512_round_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vminmaxsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char, _Constant int)">;
+  def vminmaxsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vminmaxss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vcvtnebf162ibs128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtnebf162ibs256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtnebf162ibs512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtnebf162iubs128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtnebf162iubs256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtnebf162iubs512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtph2ibs128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtph2ibs256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtph2ibs512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtph2iubs128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtph2iubs256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtph2iubs512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtps2ibs128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, float>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtps2ibs256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtps2ibs512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, float>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtps2iubs128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, float>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtps2iubs256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtps2iubs512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, float>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttnebf162ibs128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttnebf162ibs256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttnebf162ibs512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttnebf162iubs128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttnebf162iubs256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttnebf162iubs512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttph2ibs128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttph2ibs256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttph2ibs512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttph2iubs128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttph2iubs256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttph2iubs512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2ibs128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, float>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2ibs256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2ibs512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, float>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2iubs128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, float>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2iubs256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2iubs512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, float>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvt2ps2phx128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, float>, _Vector<4, float>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvt2ps2phx256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<8, float>, _Vector<8, float>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvt2ps2phx512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<16, float>, _Vector<16, float>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtbiasph2bf8_128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtbiasph2bf8_256_mask : X86Builtin<"_Vector<16, char>(_Vector<32, char>, _Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtbiasph2bf8_512_mask : X86Builtin<"_Vector<32, char>(_Vector<64, char>, _Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtbiasph2bf8s_128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtbiasph2bf8s_256_mask : X86Builtin<"_Vector<16, char>(_Vector<32, char>, _Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtbiasph2bf8s_512_mask : X86Builtin<"_Vector<32, char>(_Vector<64, char>, _Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtbiasph2hf8_128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtbiasph2hf8_256_mask : X86Builtin<"_Vector<16, char>(_Vector<32, char>, _Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtbiasph2hf8_512_mask : X86Builtin<"_Vector<32, char>(_Vector<64, char>, _Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtbiasph2hf8s_128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtbiasph2hf8s_256_mask : X86Builtin<"_Vector<16, char>(_Vector<32, char>, _Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtbiasph2hf8s_512_mask : X86Builtin<"_Vector<32, char>(_Vector<64, char>, _Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtne2ph2bf8_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtne2ph2bf8_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtne2ph2bf8_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtne2ph2bf8s_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtne2ph2bf8s_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtne2ph2bf8s_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtne2ph2hf8_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtne2ph2hf8_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtne2ph2hf8_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtne2ph2hf8s_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtne2ph2hf8s_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtne2ph2hf8s_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvthf8_2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<16, char>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvthf8_2ph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, char>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvthf8_2ph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, char>, _Vector<32, _Float16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneph2bf8_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneph2bf8_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtneph2bf8_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneph2bf8s_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneph2bf8s_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtneph2bf8s_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneph2hf8_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneph2hf8_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtneph2hf8_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneph2hf8s_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneph2hf8s_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtneph2hf8s_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadsbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16 const *>, _Vector<8, __bf16>, unsigned char)">;
+  def storesbf16128_mask : X86Builtin<"void(_Vector<8, __bf16 *>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vaddnepbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vaddnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vaddnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vdivnepbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vdivnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vdivnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vmaxpbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vmaxpbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vmaxpbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vminpbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vminpbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vminpbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vmulnepbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vmulnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vmulnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vsubnepbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vsubnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vsubnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcomsbf16eq : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+  def vcomsbf16lt : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+  def vcomsbf16neq : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+  def vcomsbf16ge : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+  def vcomsbf16gt : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+  def vcomsbf16le : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcmppbf16512_mask : X86Builtin<"unsigned int(_Vector<32, __bf16>, _Vector<32, __bf16>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcmppbf16256_mask : X86Builtin<"unsigned short(_Vector<16, __bf16>, _Vector<16, __bf16>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcmppbf16128_mask : X86Builtin<"unsigned char(_Vector<8, __bf16>, _Vector<8, __bf16>, _Constant int, unsigned char)">;
+  def vfpclasspbf16128_mask : X86Builtin<"unsigned char(_Vector<8, __bf16>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfpclasspbf16256_mask : X86Builtin<"unsigned short(_Vector<16, __bf16>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfpclasspbf16512_mask : X86Builtin<"unsigned int(_Vector<32, __bf16>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vscalefpbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vscalefpbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vscalefpbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vrcppbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vrcppbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vrcppbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vgetexppbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vgetexppbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vgetexppbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vrsqrtpbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vrsqrtpbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vrsqrtpbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vreducenepbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Constant int, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vreducenepbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Constant int, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vreducenepbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vrndscalenepbf16_128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Constant int, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vrndscalenepbf16_256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Constant int, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vrndscalenepbf16_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vgetmantpbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Constant int, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vgetmantpbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Constant int, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vgetmantpbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vsqrtnepbf16 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vsqrtnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vsqrtnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>)">;
+  def vfmaddnepbh512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmaddnepbh256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddnepbh128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
index a14fd2c4b224d..556332dd4b217 100644
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -124,8 +124,6 @@ namespace clang {
   enum {
     LastTIBuiltin = clang::Builtin::FirstTSBuiltin - 1,
 #define BUILTIN(ID, TYPE, ATTRS) BI##ID,
-#include "clang/Basic/BuiltinsX86.def"
-#define BUILTIN(ID, TYPE, ATTRS) BI##ID,
 #include "clang/Basic/BuiltinsX86.inc"
     FirstX86_64Builtin,
     LastX86CommonBuiltin = FirstX86_64Builtin - 1,
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 1b16888a0711b..7e5a5c78aa6b5 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -24,14 +24,6 @@ namespace clang {
 namespace targets {
 
 static constexpr Builtin::Info BuiltinInfoX86[] = {
-#define BUILTIN(ID, TYPE, ATTRS)                                               \
-  {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
-#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE)                               \
-  {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
-#define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANGS, FEATURE)         \
-  {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::HEADER, LANGS},
-#include "clang/Basic/BuiltinsX86.def"
-
 #define BUILTIN(ID, TYPE, ATTRS)                                               \
   {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
 #define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE)                               \
diff --git a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
index 6c3604adc92b9..94cc218376002 100644
--- a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
+++ b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
@@ -25,12 +25,14 @@ enum class BuiltinType {
   LibBuiltin,
   LangBuiltin,
   TargetBuiltin,
+  TargetLibBuiltin,
 };
 
 class PrototypeParser {
 public:
   PrototypeParser(StringRef Substitution, const Record *Builtin)
-      : Loc(Builtin->getFieldLoc("Prototype")), Substitution(Substitution) {
+      : Loc(Builtin->getFieldLoc("Prototype")), Substitution(Substitution),
+        EnableOpenCLLong(Builtin->getValueAsBit("EnableOpenCLLong")) {
     ParsePrototype(Builtin->getValueAsString("Prototype"));
   }
 
@@ -108,9 +110,15 @@ class PrototypeParser {
     } else if (T.consume_back("&")) {
       ParseType(T);
       Type += "&";
+    } else if (EnableOpenCLLong && T.consume_front("long long")) {
+      Type += "O";
+      ParseType(T);
     } else if (T.consume_front("long")) {
       Type += "L";
       ParseType(T);
+    } else if (T.consume_front("signed")) {
+      Type += "S";
+      ParseType(T);
     } else if (T.consume_front("unsigned")) {
       Type += "U";
       ParseType(T);
@@ -155,6 +163,7 @@ class PrototypeParser {
                                .Case("__fp16", "h")
                                .Case("__int128_t", "LLLi")
                                .Case("_Float16", "x")
+                               .Case("__bf16", "y")
                                .Case("bool", "b")
                                .Case("char", "c")
                                .Case("constant_CFString", "F")
@@ -194,6 +203,7 @@ class PrototypeParser {
 private:
   SMLoc Loc;
   StringRef Substitution;
+  bool EnableOpenCLLong;
   std::string Type;
 };
 
@@ -262,6 +272,9 @@ void EmitBuiltinDef(raw_ostream &OS, StringRef Substitution,
   case BuiltinType::TargetBuiltin:
     OS << "TARGET_BUILTIN";
     break;
+  case BuiltinType::TargetLibBuiltin:
+    OS << "TARGET_HEADER_BUILTIN";
+    break;
   }
 
   OS << "(" << Spelling;
@@ -279,6 +292,12 @@ void EmitBuiltinDef(raw_ostream &OS, StringRef Substitution,
     OS << ", " << Builtin->getValueAsString("Languages");
     break;
   }
+  case BuiltinType::TargetLibBuiltin: {
+    OS << ", ";
+    HeaderNameParser{Builtin}.Print(OS);
+    OS << ", " << Builtin->getValueAsString("Languages");
+    [[fallthrough]];
+  }
   case BuiltinType::TargetBuiltin:
     OS << ", \"" << Builtin->getValueAsString("Features") << "\"";
     break;
@@ -331,6 +350,8 @@ void EmitBuiltin(raw_ostream &OS, const Record *Builtin) {
         BT = BuiltinType::AtomicBuiltin;
       } else if (Builtin->isSubClassOf("LangBuiltin")) {
         BT = BuiltinType::LangBuiltin;
+      } else if (Builtin->isSubClassOf("TargetLibBuiltin")) {
+        BT = BuiltinType::TargetLibBuiltin;
       } else if (Builtin->isSubClassOf("TargetBuiltin")) {
         BT = BuiltinType::TargetBuiltin;
       } else if (Builtin->isSubClassOf("LibBuiltin")) {
@@ -367,6 +388,10 @@ void clang::EmitClangBuiltins(const RecordKeeper &Records, raw_ostream &OS) {
 #if defined(BUILTIN) && !defined(TARGET_BUILTIN)
 #  define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
 #endif
+
+#if defined(BUILTIN) && !defined(TARGET_HEADER_BUILTIN)
+#  define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANG, FEATURE) BUILTIN(ID, TYPE, ATTRS)
+#endif
 )c++";
 
   // AtomicBuiltins are order dependent
@@ -390,5 +415,6 @@ void clang::EmitClangBuiltins(const RecordKeeper &Records, raw_ostream &OS) {
 #undef LIBBUILTIN
 #undef LANGBUILTIN
 #undef TARGET_BUILTIN
+#undef TARGET_HEADER_BUILTIN
 )c++";
 }

From 95db1116c5718004e0bd7c3b79d39987fdbbff32 Mon Sep 17 00:00:00 2001
From: David CARLIER 
Date: Sat, 4 Jan 2025 10:52:41 +0000
Subject: [PATCH 148/480] [compiler-rt][rtsan] intercept setbuf, setvbuf,
 setlinebuf and setbuffer (#121616)

---
 .../lib/rtsan/rtsan_interceptors_posix.cpp    | 35 +++++++++++++
 .../tests/rtsan_test_interceptors_posix.cpp   | 50 +++++++++++++++++++
 2 files changed, 85 insertions(+)

diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
index 9f89ab6bf1fc7..227d077290af7 100644
--- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
@@ -325,6 +325,37 @@ INTERCEPTOR(FILE *, fmemopen, void *buf, size_t size, const char *mode) {
 #define RTSAN_MAYBE_INTERCEPT_FMEMOPEN
 #endif
 
+#if SANITIZER_INTERCEPT_SETVBUF
+INTERCEPTOR(void, setbuf, FILE *stream, char *buf) {
+  __rtsan_notify_intercepted_call("setbuf");
+  return REAL(setbuf)(stream, buf);
+}
+
+INTERCEPTOR(void, setbuffer, FILE *stream, char *buf, size_t size) {
+  __rtsan_notify_intercepted_call("setbuffer");
+  return REAL(setbuffer)(stream, buf, size);
+}
+
+INTERCEPTOR(void, setlinebuf, FILE *stream) {
+  __rtsan_notify_intercepted_call("setlinebuf");
+  return REAL(setlinebuf)(stream);
+}
+
+INTERCEPTOR(int, setvbuf, FILE *stream, char *buf, int mode, size_t size) {
+  __rtsan_notify_intercepted_call("setvbuf");
+  return REAL(setvbuf)(stream, buf, mode, size);
+}
+#define RTSAN_MAYBE_INTERCEPT_SETBUF INTERCEPT_FUNCTION(setbuf)
+#define RTSAN_MAYBE_INTERCEPT_SETBUFFER INTERCEPT_FUNCTION(setbuffer)
+#define RTSAN_MAYBE_INTERCEPT_SETLINEBUF INTERCEPT_FUNCTION(setlinebuf)
+#define RTSAN_MAYBE_INTERCEPT_SETVBUF INTERCEPT_FUNCTION(setvbuf)
+#else
+#define RTSAN_MAYBE_INTERCEPT_SETBUF
+#define RTSAN_MAYBE_INTERCEPT_SETBUFFER
+#define RTSAN_MAYBE_INTERCEPT_SETLINEBUF
+#define RTSAN_MAYBE_INTERCEPT_SETVBUF
+#endif
+
 INTERCEPTOR(int, puts, const char *s) {
   __rtsan_notify_intercepted_call("puts");
   return REAL(puts)(s);
@@ -986,6 +1017,10 @@ void __rtsan::InitializeInterceptors() {
   RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE;
   RTSAN_MAYBE_INTERCEPT_OPEN_MEMSTREAM;
   RTSAN_MAYBE_INTERCEPT_FMEMOPEN;
+  RTSAN_MAYBE_INTERCEPT_SETBUF;
+  RTSAN_MAYBE_INTERCEPT_SETBUFFER;
+  RTSAN_MAYBE_INTERCEPT_SETLINEBUF;
+  RTSAN_MAYBE_INTERCEPT_SETVBUF;
   INTERCEPT_FUNCTION(lseek);
   RTSAN_MAYBE_INTERCEPT_LSEEK64;
   INTERCEPT_FUNCTION(dup);
diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
index 5adbf0fb63de8..2947510b2cfde 100644
--- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
@@ -403,6 +403,56 @@ TEST_F(RtsanFileTest, FmemOpenDiesWhenRealtime) {
 }
 #endif
 
+#if SANITIZER_INTERCEPT_SETVBUF
+TEST_F(RtsanFileTest, SetbufDieWhenRealtime) {
+  char buffer[BUFSIZ];
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+
+  auto Func = [&f, &buffer]() { setbuf(f, buffer); };
+
+  ExpectRealtimeDeath(Func, "setbuf");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(RtsanFileTest, SetbufferDieWhenRealtime) {
+  char buffer[1024];
+  size_t size = sizeof(buffer);
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+
+  auto Func = [&f, &buffer, &size]() { setbuffer(f, buffer, size); };
+
+  ExpectRealtimeDeath(Func, "setbuffer");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(RtsanFileTest, SetvbufDieWhenRealtime) {
+  char buffer[1024];
+  size_t size = sizeof(buffer);
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+
+  auto Func = [&f, &buffer, &size]() {
+    int r = setvbuf(f, buffer, _IOFBF, size);
+    EXPECT_THAT(r, Eq(0));
+  };
+
+  ExpectRealtimeDeath(Func, "setvbuf");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(RtsanFileTest, SetlinebufDieWhenRealtime) {
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+
+  auto Func = [&f]() { setlinebuf(f); };
+
+  ExpectRealtimeDeath(Func, "setlinebuf");
+  ExpectNonRealtimeSurvival(Func);
+}
+#endif
+
 class RtsanOpenedFileTest : public RtsanFileTest {
 protected:
   void SetUp() override {

From c7fa3cf620f62d87dc7753f5d341ae3f63da87f4 Mon Sep 17 00:00:00 2001
From: David CARLIER 
Date: Sat, 4 Jan 2025 11:35:31 +0000
Subject: [PATCH 149/480] =?UTF-8?q?Revert=20"[compiler-rt][rtsan]=20interc?=
 =?UTF-8?q?ept=20setbuf,=20setvbuf,=20setlinebuf=20an=E2=80=A6=20(#121639)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…d setbuffer (#121616)"

This reverts commit 95db1116c5718004e0bd7c3b79d39987fdbbff32.
---
 .../lib/rtsan/rtsan_interceptors_posix.cpp    | 35 -------------
 .../tests/rtsan_test_interceptors_posix.cpp   | 50 -------------------
 2 files changed, 85 deletions(-)

diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
index 227d077290af7..9f89ab6bf1fc7 100644
--- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
@@ -325,37 +325,6 @@ INTERCEPTOR(FILE *, fmemopen, void *buf, size_t size, const char *mode) {
 #define RTSAN_MAYBE_INTERCEPT_FMEMOPEN
 #endif
 
-#if SANITIZER_INTERCEPT_SETVBUF
-INTERCEPTOR(void, setbuf, FILE *stream, char *buf) {
-  __rtsan_notify_intercepted_call("setbuf");
-  return REAL(setbuf)(stream, buf);
-}
-
-INTERCEPTOR(void, setbuffer, FILE *stream, char *buf, size_t size) {
-  __rtsan_notify_intercepted_call("setbuffer");
-  return REAL(setbuffer)(stream, buf, size);
-}
-
-INTERCEPTOR(void, setlinebuf, FILE *stream) {
-  __rtsan_notify_intercepted_call("setlinebuf");
-  return REAL(setlinebuf)(stream);
-}
-
-INTERCEPTOR(int, setvbuf, FILE *stream, char *buf, int mode, size_t size) {
-  __rtsan_notify_intercepted_call("setvbuf");
-  return REAL(setvbuf)(stream, buf, mode, size);
-}
-#define RTSAN_MAYBE_INTERCEPT_SETBUF INTERCEPT_FUNCTION(setbuf)
-#define RTSAN_MAYBE_INTERCEPT_SETBUFFER INTERCEPT_FUNCTION(setbuffer)
-#define RTSAN_MAYBE_INTERCEPT_SETLINEBUF INTERCEPT_FUNCTION(setlinebuf)
-#define RTSAN_MAYBE_INTERCEPT_SETVBUF INTERCEPT_FUNCTION(setvbuf)
-#else
-#define RTSAN_MAYBE_INTERCEPT_SETBUF
-#define RTSAN_MAYBE_INTERCEPT_SETBUFFER
-#define RTSAN_MAYBE_INTERCEPT_SETLINEBUF
-#define RTSAN_MAYBE_INTERCEPT_SETVBUF
-#endif
-
 INTERCEPTOR(int, puts, const char *s) {
   __rtsan_notify_intercepted_call("puts");
   return REAL(puts)(s);
@@ -1017,10 +986,6 @@ void __rtsan::InitializeInterceptors() {
   RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE;
   RTSAN_MAYBE_INTERCEPT_OPEN_MEMSTREAM;
   RTSAN_MAYBE_INTERCEPT_FMEMOPEN;
-  RTSAN_MAYBE_INTERCEPT_SETBUF;
-  RTSAN_MAYBE_INTERCEPT_SETBUFFER;
-  RTSAN_MAYBE_INTERCEPT_SETLINEBUF;
-  RTSAN_MAYBE_INTERCEPT_SETVBUF;
   INTERCEPT_FUNCTION(lseek);
   RTSAN_MAYBE_INTERCEPT_LSEEK64;
   INTERCEPT_FUNCTION(dup);
diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
index 2947510b2cfde..5adbf0fb63de8 100644
--- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
@@ -403,56 +403,6 @@ TEST_F(RtsanFileTest, FmemOpenDiesWhenRealtime) {
 }
 #endif
 
-#if SANITIZER_INTERCEPT_SETVBUF
-TEST_F(RtsanFileTest, SetbufDieWhenRealtime) {
-  char buffer[BUFSIZ];
-  FILE *f = fopen(GetTemporaryFilePath(), "w");
-  EXPECT_THAT(f, Ne(nullptr));
-
-  auto Func = [&f, &buffer]() { setbuf(f, buffer); };
-
-  ExpectRealtimeDeath(Func, "setbuf");
-  ExpectNonRealtimeSurvival(Func);
-}
-
-TEST_F(RtsanFileTest, SetbufferDieWhenRealtime) {
-  char buffer[1024];
-  size_t size = sizeof(buffer);
-  FILE *f = fopen(GetTemporaryFilePath(), "w");
-  EXPECT_THAT(f, Ne(nullptr));
-
-  auto Func = [&f, &buffer, &size]() { setbuffer(f, buffer, size); };
-
-  ExpectRealtimeDeath(Func, "setbuffer");
-  ExpectNonRealtimeSurvival(Func);
-}
-
-TEST_F(RtsanFileTest, SetvbufDieWhenRealtime) {
-  char buffer[1024];
-  size_t size = sizeof(buffer);
-  FILE *f = fopen(GetTemporaryFilePath(), "w");
-  EXPECT_THAT(f, Ne(nullptr));
-
-  auto Func = [&f, &buffer, &size]() {
-    int r = setvbuf(f, buffer, _IOFBF, size);
-    EXPECT_THAT(r, Eq(0));
-  };
-
-  ExpectRealtimeDeath(Func, "setvbuf");
-  ExpectNonRealtimeSurvival(Func);
-}
-
-TEST_F(RtsanFileTest, SetlinebufDieWhenRealtime) {
-  FILE *f = fopen(GetTemporaryFilePath(), "w");
-  EXPECT_THAT(f, Ne(nullptr));
-
-  auto Func = [&f]() { setlinebuf(f); };
-
-  ExpectRealtimeDeath(Func, "setlinebuf");
-  ExpectNonRealtimeSurvival(Func);
-}
-#endif
-
 class RtsanOpenedFileTest : public RtsanFileTest {
 protected:
   void SetUp() override {

From 914fd916d5456e15cf9baaf617edaac6b7334d09 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser 
Date: Sat, 4 Jan 2025 14:49:22 +0100
Subject: [PATCH 150/480] [libc++][NFC] Simplify basic_ostream by combining
 operator<<(Arithmetic) (#121011)

The `operator<<` overloads for arithmetic types have very similar or
even identical bodies. This introduces two new helper functions to
avoid all the duplication.
---
 libcxx/include/__ostream/basic_ostream.h | 284 +++++------------------
 1 file changed, 62 insertions(+), 222 deletions(-)

diff --git a/libcxx/include/__ostream/basic_ostream.h b/libcxx/include/__ostream/basic_ostream.h
index cf4d26167aebd..97226476e5ef0 100644
--- a/libcxx/include/__ostream/basic_ostream.h
+++ b/libcxx/include/__ostream/basic_ostream.h
@@ -88,6 +88,55 @@ class _LIBCPP_TEMPLATE_VIS basic_ostream : virtual public basic_ios<_CharT, _Tra
     return *this;
   }
 
+  template <class _Tp>
+  _LIBCPP_HIDE_FROM_ABI basic_ostream& __put_num(_Tp __value) {
+#  if _LIBCPP_HAS_EXCEPTIONS
+    try {
+#  endif // _LIBCPP_HAS_EXCEPTIONS
+      sentry __s(*this);
+      if (__s) {
+        using _Fp          = num_put<char_type, ostreambuf_iterator<char_type, traits_type> >;
+        const _Fp& __facet = std::use_facet<_Fp>(this->getloc());
+        if (__facet.put(*this, *this, this->fill(), __value).failed())
+          this->setstate(ios_base::badbit | ios_base::failbit);
+      }
+#  if _LIBCPP_HAS_EXCEPTIONS
+    } catch (...) {
+      this->__set_badbit_and_consider_rethrow();
+    }
+#  endif // _LIBCPP_HAS_EXCEPTIONS
+    return *this;
+  }
+
+  template <class _Tp>
+  _LIBCPP_HIDE_FROM_ABI basic_ostream& __put_num_integer_promote(_Tp __value) {
+#  if _LIBCPP_HAS_EXCEPTIONS
+    try {
+#  endif // _LIBCPP_HAS_EXCEPTIONS
+      sentry __s(*this);
+      if (__s) {
+        ios_base::fmtflags __flags = ios_base::flags() & ios_base::basefield;
+
+        using _Fp          = num_put<char_type, ostreambuf_iterator<char_type, traits_type> >;
+        const _Fp& __facet = std::use_facet<_Fp>(this->getloc());
+        if (__facet
+                .put(*this,
+                     *this,
+                     this->fill(),
+                     __flags == ios_base::oct || __flags == ios_base::hex
+                         ? static_cast<__copy_unsigned_t<_Tp, long> >(std::__to_unsigned_like(__value))
+                         : static_cast<__copy_unsigned_t<_Tp, long> >(__value))
+                .failed())
+          this->setstate(ios_base::badbit | ios_base::failbit);
+      }
+#  if _LIBCPP_HAS_EXCEPTIONS
+    } catch (...) {
+      this->__set_badbit_and_consider_rethrow();
+    }
+#  endif // _LIBCPP_HAS_EXCEPTIONS
+    return *this;
+  }
+
   basic_ostream& operator<<(bool __n);
   basic_ostream& operator<<(short __n);
   basic_ostream& operator<<(unsigned short __n);
@@ -225,276 +274,67 @@ basic_ostream<_CharT, _Traits>::operator<<(basic_streambuf
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(bool __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(short __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      ios_base::fmtflags __flags = ios_base::flags() & ios_base::basefield;
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this,
-                  *this,
-                  this->fill(),
-                  __flags == ios_base::oct || __flags == ios_base::hex
-                      ? static_cast<long>(static_cast<unsigned short>(__n))
-                      : static_cast<long>(__n))
-              .failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num_integer_promote(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned short __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), static_cast<unsigned long>(__n)).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num_integer_promote(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(int __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      ios_base::fmtflags __flags = ios_base::flags() & ios_base::basefield;
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this,
-                  *this,
-                  this->fill(),
-                  __flags == ios_base::oct || __flags == ios_base::hex
-                      ? static_cast<long>(static_cast<unsigned int>(__n))
-                      : static_cast<long>(__n))
-              .failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num_integer_promote(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned int __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), static_cast<unsigned long>(__n)).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num_integer_promote(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(long __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned long __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(long long __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned long long __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(float __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), static_cast<double>(__n)).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return *this << static_cast<double>(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(double __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(long double __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(const void* __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template 

From c9d61cde2ba3521c7604c8ee0c3e1ba4dfc4d406 Mon Sep 17 00:00:00 2001
From: Matthias Springer 
Date: Sat, 4 Jan 2025 15:16:35 +0100
Subject: [PATCH 151/480] [mlir][Transforms][NFC] Delete unused
 `nTo1TempMaterializations` (#121647)

`nTo1TempMaterializations` is no longer used since the conversion value
mapping supports 1:N mappings.
---
 mlir/lib/Transforms/Utils/DialectConversion.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 1e689cd96ae71..0e577d2d39de3 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -1040,10 +1040,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   DenseMap<UnrealizedConversionCastOp, UnresolvedMaterializationRewrite *>
       unresolvedMaterializations;
 
-  /// A set of all N:1 materializations that were added to work around
-  /// incomplete 1:N support in the dialect conversion driver.
-  DenseSet<UnrealizedConversionCastOp> nTo1TempMaterializations;
-
   /// The current type converter, or nullptr if no type converter is currently
   /// active.
   const TypeConverter *currentTypeConverter = nullptr;
@@ -1180,7 +1176,6 @@ void UnresolvedMaterializationRewrite::rollback() {
   if (!mappedValues.empty())
     rewriterImpl.mapping.erase(mappedValues);
   rewriterImpl.unresolvedMaterializations.erase(getOperation());
-  rewriterImpl.nTo1TempMaterializations.erase(getOperation());
   op->erase();
 }
 

From 47ac7fa8619c1f1e29ee4aafded2ae990ffa319e Mon Sep 17 00:00:00 2001
From: Florian Hahn 
Date: Sat, 4 Jan 2025 14:50:04 +0000
Subject: [PATCH 152/480] [LV] Add tests with wide inductions and live-in step.

Also regenerate check lines and simplify existing tests and names.
---
 .../LoopVectorize/induction-step.ll           | 513 +++++++++++++-----
 1 file changed, 387 insertions(+), 126 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/induction-step.ll b/llvm/test/Transforms/LoopVectorize/induction-step.ll
index ecb00d4724488..f553864b5fc38 100644
--- a/llvm/test/Transforms/LoopVectorize/induction-step.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction-step.ll
@@ -1,21 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=8 -S | FileCheck %s
 
 ; int int_inc;
 ;
-;int induction_with_global(int init, int *restrict A, int N) {
+;void induction_with_global(int init, int *restrict A, int N) {
 ;  int x = init;
 ;  for (int i=0;i poison, i32 %init, i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP5]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[DOTCAST]], [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[INIT]], [[TMP3]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[INIT]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> poison, <8 x i32> zeroinitializer
@@ -24,53 +42,56 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP0]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP7]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK-NEXT:    %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
-; CHECK:         [[TMP8:%.*]] = add i64 %index, 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ [[INDUCTION4]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    store <8 x i32> %vec.ind, ptr [[TMP10]], align 4
-; CHECK:         %index.next = add nuw i64 %index, 8
-; CHECK-NEXT:    %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]]
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-
-@int_inc = common global i32 0, align 4
-
-define i32 @induction_with_global(i32 %init, ptr noalias nocapture %A, i32 %N) {
+; CHECK-NEXT:    store <8 x i32> [[VEC_IND]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT6]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[X_05:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[X_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[TMP0]], [[X_05]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
-  %cmp4 = icmp sgt i32 %N, 0
-  br i1 %cmp4, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
   %0 = load i32, ptr @int_inc, align 4
-  %1 = mul i32 %0, %N
   br label %for.body
 
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %x.05 = phi i32 [ %init, %for.body.lr.ph ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %x.05 = phi i32 [ %init, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %iv
   store i32 %x.05, ptr %arrayidx, align 4
   %add = add nsw i32 %0, %x.05
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  %2 = add i32 %1, %init
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  %x.0.lcssa = phi i32 [ %init, %entry ], [ %2, %for.end.loopexit ]
-  ret i32 %x.0.lcssa
-}
+  %iv.next = add nuw nsw i64 %iv, 1
+  %iv.next.trunc = trunc i64 %iv.next to i32
+  %exitcond = icmp eq i32 %iv.next.trunc, %N
+  br i1 %exitcond, label %exit, label %for.body
 
+exit:
+  ret void
+}
 
 ;int induction_with_loop_inv(int init, int *restrict A, int N, int M) {
 ;  int x = init;
@@ -83,82 +104,123 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ;  return x;
 ;}
 
-; CHECK-LABEL: @induction_with_loop_inv(
-; CHECK:       vector.ph:
-; CHECK:         [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 %x.011, i64 0
+define i32 @induction_with_loop_inv(i32 %init, ptr noalias nocapture %A, i32 %N, i32 %M) {
+; CHECK-LABEL: define i32 @induction_with_loop_inv(
+; CHECK-SAME: i32 [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i32 [[N:%.*]], i32 [[M:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[TMP11]], 1
+; CHECK-NEXT:    br label %[[OUTER_HEADER:.*]]
+; CHECK:       [[OUTER_HEADER]]:
+; CHECK-NEXT:    [[INDVARS_IV15:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT16:%.*]], %[[OUTER_LATCH:.*]] ]
+; CHECK-NEXT:    [[J_012:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC5:%.*]], %[[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[X_011:%.*]] = phi i32 [ [[INIT]], %[[ENTRY]] ], [ [[X_0_LCSSA:%.*]], %[[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[DOTCAST]], [[J_012]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[X_011]], [[TMP1]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[X_011]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 %j.012, i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 [[J_012]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> , [[DOTSPLAT3]]
 ; CHECK-NEXT:    [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 %j.012, 8
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[J_012]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK-NEXT:    %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
-; CHECK:         [[TMP6:%.*]] = add i64 %index, 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ [[INDUCTION4]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT:    store <8 x i32> %vec.ind, ptr [[TMP8]], align 4
-; CHECK:         %index.next = add nuw i64 %index, 8
-; CHECK-NEXT:    %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]]
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-
-define i32 @induction_with_loop_inv(i32 %init, ptr noalias nocapture %A, i32 %N, i32 %M) {
+; CHECK-NEXT:    store <8 x i32> [[VEC_IND]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[INNER_EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[X_011]], %[[OUTER_HEADER]] ]
+; CHECK-NEXT:    br label %[[INNER:.*]]
+; CHECK:       [[INNER]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[INNER]] ]
+; CHECK-NEXT:    [[X_18:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[INNER]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[X_18]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[X_18]], [[J_012]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[INNER_EXIT]], label %[[INNER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[INNER_EXIT]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[X_011]], [[INDVARS_IV15]]
+; CHECK-NEXT:    br label %[[OUTER_LATCH]]
+; CHECK:       [[OUTER_LATCH]]:
+; CHECK-NEXT:    [[X_0_LCSSA]] = phi i32 [ [[TMP9]], %[[INNER_EXIT]] ]
+; CHECK-NEXT:    [[INC5]] = add nuw nsw i32 [[J_012]], 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT16]] = add i32 [[INDVARS_IV15]], [[N]]
+; CHECK-NEXT:    [[EXITCOND17:%.*]] = icmp eq i32 [[INC5]], [[M]]
+; CHECK-NEXT:    br i1 [[EXITCOND17]], label %[[EXIT:.*]], label %[[OUTER_HEADER]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret i32 [[X_0_LCSSA]]
+;
 entry:
-  %cmp10 = icmp sgt i32 %M, 0
-  br i1 %cmp10, label %for.cond1.preheader.lr.ph, label %for.end6
-
-for.cond1.preheader.lr.ph:                        ; preds = %entry
-  %cmp27 = icmp sgt i32 %N, 0
-  br label %for.cond1.preheader
-
-for.cond1.preheader:                              ; preds = %for.inc4, %for.cond1.preheader.lr.ph
-  %indvars.iv15 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next16, %for.inc4 ]
-  %j.012 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc5, %for.inc4 ]
-  %x.011 = phi i32 [ %init, %for.cond1.preheader.lr.ph ], [ %x.1.lcssa, %for.inc4 ]
-  br i1 %cmp27, label %for.body3.preheader, label %for.inc4
-
-for.body3.preheader:                              ; preds = %for.cond1.preheader
-  br label %for.body3
-
-for.body3:                                        ; preds = %for.body3.preheader, %for.body3
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ]
-  %x.18 = phi i32 [ %add, %for.body3 ], [ %x.011, %for.body3.preheader ]
-  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
+  %j.012 = phi i32 [ 0, %entry ], [ %inc5, %outer.latch ]
+  %x.011 = phi i32 [ %init, %entry ], [ %x.1.lcssa, %outer.latch ]
+  br label %inner
+
+inner:
+  %iv = phi i64 [ 0, %outer.header ], [ %iv.next, %inner ]
+  %x.18 = phi i32 [ %x.011, %outer.header ], [ %add, %inner ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %iv
   store i32 %x.18, ptr %arrayidx, align 4
   %add = add nsw i32 %x.18, %j.012
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.inc4.loopexit, label %for.body3
+  %iv.next = add nuw nsw i64 %iv, 1
+  %iv.next.trunc = trunc i64 %iv.next to i32
+  %inner.ec = icmp eq i32 %iv.next.trunc, %N
+  br i1 %inner.ec, label %inner.exit, label %inner
 
-for.inc4.loopexit:                                ; preds = %for.body3
-  %0 = add i32 %x.011, %indvars.iv15
-  br label %for.inc4
+inner.exit:
+  %add.ivs  = add i32 %x.011, %outer.iv
+  br label %outer.latch
 
-for.inc4:                                         ; preds = %for.inc4.loopexit, %for.cond1.preheader
-  %x.1.lcssa = phi i32 [ %x.011, %for.cond1.preheader ], [ %0, %for.inc4.loopexit ]
+outer.latch:
+  %x.1.lcssa = phi i32 [ %add.ivs, %inner.exit ]
   %inc5 = add nuw nsw i32 %j.012, 1
-  %indvars.iv.next16 = add i32 %indvars.iv15, %N
-  %exitcond17 = icmp eq i32 %inc5, %M
-  br i1 %exitcond17, label %for.end6.loopexit, label %for.cond1.preheader
-
-for.end6.loopexit:                                ; preds = %for.inc4
-  %x.1.lcssa.lcssa = phi i32 [ %x.1.lcssa, %for.inc4 ]
-  br label %for.end6
+  %outer.iv.next = add i32 %outer.iv, %N
+  %outer.ec = icmp eq i32 %inc5, %M
+  br i1 %outer.ec, label %exit, label %outer.header
 
-for.end6:                                         ; preds = %for.end6.loopexit, %entry
-  %x.0.lcssa = phi i32 [ %init, %entry ], [ %x.1.lcssa.lcssa, %for.end6.loopexit ]
-  ret i32 %x.0.lcssa
+exit:
+  ret i32 %x.1.lcssa
 }
 
-
-; CHECK-LABEL: @non_primary_iv_loop_inv_trunc(
-; CHECK:       vector.ph:
-; CHECK:         [[TMP3:%.*]] = trunc i64 %step to i32
+define void @non_primary_iv_loop_inv_trunc(ptr %a, i64 %n, i64 %step) {
+; CHECK-LABEL: define void @non_primary_iv_loop_inv_trunc(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i64 [[STEP:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[STEP]] to i32
 ; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> , [[DOTSPLAT6]]
@@ -166,19 +228,38 @@ for.end6:                                         ; preds = %for.end6.loopexit,
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP3]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT8]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK:         [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ]
-; CHECK:         [[TMP6:%.*]] = add i64 %index, 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
 ; CHECK-NEXT:    store <8 x i32> [[VEC_IND10]], ptr [[TMP8]], align 4
-; CHECK-NEXT:    %index.next = add nuw i64 %index, 8
-; CHECK:         [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]]
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-
-define void @non_primary_iv_loop_inv_trunc(ptr %a, i64 %n, i64 %step) {
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[J]] to i32
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], [[STEP]]
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[FOR_BODY]], label %[[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -197,22 +278,43 @@ for.end:
   ret void
 }
 
-; CHECK-LABEL: @iv_no_binary_op_in_descriptor(
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+
+define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
+; CHECK-LABEL: define void @iv_no_binary_op_in_descriptor(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    store <8 x i64> [[VEC_IND]], ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP3]], label %middle.block, label [[VECTOR_BODY]]
-
-define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT_P:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i64 [[IV]], ptr [[GEP]], align 8
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add i64 [[IV]], 1
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT_P]] = phi i64 [ [[IV_NEXT]], %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT_P]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop.header
 
@@ -231,3 +333,162 @@ loop.latch:
 exit:
   ret void
 }
+
+define void @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
+; CHECK-LABEL: define void @wide_add_induction_step_live_in(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i16 [[OFF:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[O_1:%.*]] = add i16 [[OFF]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i16 [[DOTCAST]], [[O_1]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <8 x i16> , [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i16> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i16 [[O_1]], 8
+; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i16> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add <8 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <8 x i16> [[TMP4]], ptr [[TMP6]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i16> [[VEC_IND]], [[DOTSPLAT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ADD]] = add i16 [[IV_2]], [[O_1]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[ADD]], ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %o.1 = add i16 %off, 2
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.2 = phi i16 [ 0, %entry ], [ %add, %loop ]
+  %add = add i16 %iv.2, %o.1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %add, ptr %gep.dst, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec , label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
+; CHECK-LABEL: define void @wide_sub_induction_step_live_in(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i16 [[OFF:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[O_1:%.*]] = add i16 [[OFF]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i16 -2, [[OFF]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i16 [[DOTCAST]], [[TMP0]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i16> , [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i16> zeroinitializer, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i16 [[TMP0]], 8
+; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP3]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i16> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <8 x i16> [[TMP5]], ptr [[TMP7]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i16> [[VEC_IND]], [[DOTSPLAT2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[SUB:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[SUB]] = sub i16 [[IV_2]], [[O_1]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[SUB]], ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %o.1 = add i16 %off, 2
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.2 = phi i16 [ 0, %entry ], [ %sub, %loop ]
+  %sub = sub i16 %iv.2, %o.1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %sub, ptr %gep.dst, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec , label %exit, label %loop
+
+exit:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+;.

From da2a9ede81a88bea0bba28a543441197772e4727 Mon Sep 17 00:00:00 2001
From: Vlad Serebrennikov 
Date: Sat, 4 Jan 2025 18:53:01 +0400
Subject: [PATCH 153/480] [clang] Update C++ DR status page (#121642)

This patch brings our C++ DR status page up to date with WG21 updates.

[CWG1223](https://cplusplus.github.io/CWG/issues/1223.html) "Syntactic
disambiguation and _trailing-return-types_" is resolved by
[P2915R0](https://wg21.link/p2915r0) "Proposed resolution for CWG1223".
Both the test and the paper were written by @cor3ntin, so I presume no
updates are needed.

[CWG2819](https://cplusplus.github.io/CWG/issues/2819.html) "Cast from
null pointer value in a constant expression" was revisited and marked as
not a DR, so I updated the test to ensure that the example is not
accepted in C++23 and earlier modes. CC @offsetof.

Tentative resolutions to the following issues were simply promoted to
actual resolutions, so tests don't require any meaningful changes:
- [CWG2913](https://cplusplus.github.io/CWG/issues/2913.html) "Grammar
for deduction-guide has requires-clause in the wrong position"
(@zyn0217)
- [CWG2915](https://cplusplus.github.io/CWG/issues/2915.html) "Explicit
object parameters of type void" (@MitalAshok)
- [CWG2922](https://cplusplus.github.io/CWG/issues/2922.html) "constexpr
placement-new is too permissive" (@cor3ntin)

As a drive-by fix, I updated the `make_cxx_dr_status` script to
accommodate the `C++23 onwards` and `C++26 onwards` statuses, which are
useful for Core issues that are not DRs.
---
 clang/test/CXX/drs/cwg12xx.cpp |   2 +-
 clang/test/CXX/drs/cwg28xx.cpp |  25 +-
 clang/test/CXX/drs/cwg29xx.cpp |   6 +-
 clang/www/cxx_dr_status.html   | 534 ++++++++++++++++++++++++---------
 clang/www/make_cxx_dr_status   |   6 +
 5 files changed, 413 insertions(+), 160 deletions(-)

diff --git a/clang/test/CXX/drs/cwg12xx.cpp b/clang/test/CXX/drs/cwg12xx.cpp
index cdfbc6d672658..951c71a9832de 100644
--- a/clang/test/CXX/drs/cwg12xx.cpp
+++ b/clang/test/CXX/drs/cwg12xx.cpp
@@ -32,7 +32,7 @@ namespace cwg1213 { // cwg1213: 7
 }
 
 #if __cplusplus >= 201103L
-namespace cwg1223 { // cwg1223: 17 drafting 2023-05-12
+namespace cwg1223 { // cwg1223: 17
 struct M;
 template 
 struct V;
diff --git a/clang/test/CXX/drs/cwg28xx.cpp b/clang/test/CXX/drs/cwg28xx.cpp
index ff625a4a985bc..40e2b25eedde0 100644
--- a/clang/test/CXX/drs/cwg28xx.cpp
+++ b/clang/test/CXX/drs/cwg28xx.cpp
@@ -1,9 +1,9 @@
 // RUN: %clang_cc1 -std=c++98 -pedantic-errors -verify=expected,cxx98 %s
-// RUN: %clang_cc1 -std=c++11 -pedantic-errors -verify=expected %s
-// RUN: %clang_cc1 -std=c++14 -pedantic-errors -verify=expected %s
-// RUN: %clang_cc1 -std=c++17 -pedantic-errors -verify=expected %s
-// RUN: %clang_cc1 -std=c++20 -pedantic-errors -verify=expected,since-cxx20 %s
-// RUN: %clang_cc1 -std=c++23 -pedantic-errors -verify=expected,since-cxx20,since-cxx23 %s
+// RUN: %clang_cc1 -std=c++11 -pedantic-errors -verify=expected,cxx11-23 %s
+// RUN: %clang_cc1 -std=c++14 -pedantic-errors -verify=expected,cxx11-23 %s
+// RUN: %clang_cc1 -std=c++17 -pedantic-errors -verify=expected,cxx11-23 %s
+// RUN: %clang_cc1 -std=c++20 -pedantic-errors -verify=expected,cxx11-23,since-cxx20 %s
+// RUN: %clang_cc1 -std=c++23 -pedantic-errors -verify=expected,cxx11-23,since-cxx20,since-cxx23 %s
 // RUN: %clang_cc1 -std=c++2c -pedantic-errors -verify=expected,since-cxx20,since-cxx23,since-cxx26 %s
 
 
@@ -47,12 +47,17 @@ void f() {
 #endif
 } // namespace cwg2813
 
-namespace cwg2819 { // cwg2819: 19 tentatively ready 2023-12-01
-
-#if __cpp_constexpr >= 202306L
+namespace cwg2819 { // cwg2819: 19 c++26
+#if __cplusplus >= 201103L
+  // CWG 2024-04-19: This issue is not a DR.
   constexpr void* p = nullptr;
-  constexpr int* q = static_cast(p);
-  static_assert(q == nullptr);
+  constexpr int* q = static_cast(p); // #cwg2819-q
+  // cxx11-23-error@-1 {{constexpr variable 'q' must be initialized by a constant expression}}
+  //   cxx11-23-note@-2 {{cast from 'void *' is not allowed in a constant expression}}
+  static_assert(q == nullptr, "");
+  // cxx11-23-error@-1 {{static assertion expression is not an integral constant expression}}
+  //   cxx11-23-note@-2 {{initializer of 'q' is not a constant expression}}
+  //   cxx11-23-note@#cwg2819-q {{declared here}}
 #endif
 }
 
diff --git a/clang/test/CXX/drs/cwg29xx.cpp b/clang/test/CXX/drs/cwg29xx.cpp
index 9629bdd41a2a5..2aa52ad98ada8 100644
--- a/clang/test/CXX/drs/cwg29xx.cpp
+++ b/clang/test/CXX/drs/cwg29xx.cpp
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -std=c++23 -pedantic-errors -verify=expected %s
 // RUN: %clang_cc1 -std=c++2c -pedantic-errors -verify=expected %s
 
-namespace cwg2913 { // cwg2913: 20 tentatively ready 2024-08-16
+namespace cwg2913 { // cwg2913: 20
 
 #if __cplusplus >= 202002L
 
@@ -26,7 +26,7 @@ R(T, T) requires true -> R; // expected-error {{expected function body after
 
 } // namespace cwg2913
 
-namespace cwg2915 { // cwg2915: 20 tentatively ready 2024-08-16
+namespace cwg2915 { // cwg2915: 20
 #if __cplusplus >= 202302L
 struct A {
   void f(this void); // expected-error {{explicit object parameter cannot have 'void' type}}
@@ -61,7 +61,7 @@ void *operator new(std::size_t, void *p) { return p; }
 void* operator new[] (std::size_t, void* p) {return p;}
 
 
-namespace cwg2922 { // cwg2922: 20 tentatively ready 2024-07-10
+namespace cwg2922 { // cwg2922: 20
 union U { int a, b; };
 constexpr U nondeterministic(bool i) {
   if(i) {
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 386c57250b7db..c069e155fd547 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -1442,7 +1442,7 @@ 

C++ defect report implementation status

233 - DR + DRWP References vs pointers in UDC overload resolution Unknown @@ -7153,15 +7153,11 @@

C++ defect report implementation status

Unnecessary restriction on auto array types Unknown - + 1223 - drafting + DRWP Syntactic disambiguation and trailing-return-types - -
- Not resolved - Clang 17 implements 2023-05-12 resolution -
+ Clang 17 1224 @@ -8945,11 +8941,11 @@

C++ defect report implementation status

Alias template specialization vs pack expansion Unknown - + 1521 - drafting + dup T{expr} with reference types - Not resolved + Unknown 1522 @@ -11545,11 +11541,11 @@

C++ defect report implementation status

Constant expressions and library undefined behavior Unknown - + 1953 - open + DR Data races and common initial sequence - Not resolved + Unknown 1954 @@ -11619,7 +11615,7 @@

C++ defect report implementation status

1965 - drafting + open Explicit casts to reference types Not resolved @@ -12693,7 +12689,7 @@

C++ defect report implementation status

2144 - DR + DRWP Function/variable declaration ambiguity Unknown @@ -13525,11 +13521,11 @@

C++ defect report implementation status

Consistency with mismatched aligned/non-over-aligned allocation/deallocation functions Unknown - + 2283 - drafting + DR Missing complete type requirements - Not resolved + Unknown 2284 @@ -15183,7 +15179,7 @@

C++ defect report implementation status

2557 - drafting + review Class member access referring to an unrelated class Not resolved @@ -15207,7 +15203,7 @@

C++ defect report implementation status

2561 - DR + DRWP Conversion to function pointer for lambda with explicit object parameter No @@ -15373,7 +15369,7 @@

C++ defect report implementation status

2588 - DR + DRWP friend declarations and module linkage Unknown @@ -16213,7 +16209,7 @@

C++ defect report implementation status

2728 - DR + DRWP Evaluation of conversions in a delete-expression Unknown @@ -16734,11 +16730,11 @@

C++ defect report implementation status

Alignment requirement of incomplete class type Unknown - + 2815 - tentatively ready + DR Overload resolution for references/pointers to noexcept functions - Not resolved + Unknown 2816 @@ -16754,15 +16750,15 @@

C++ defect report implementation status

2818 - DR + DRWP Use of predefined reserved identifiers Unknown 2819 - accepted + WP Cast from null pointer value in a constant expression - Clang 19 + Clang 19 (C++26 onwards) 2820 @@ -16862,7 +16858,7 @@

C++ defect report implementation status

2836 - DR + DRWP Conversion rank of long double and extended floating-point types Unknown @@ -16904,7 +16900,7 @@

C++ defect report implementation status

2843 - review + drafting Undated reference to Unicode makes C++ a moving target Not resolved @@ -16998,13 +16994,13 @@

C++ defect report implementation status

2858 - accepted + WP Declarative nested-name-specifiers and pack-index-specifiers Clang 19 2859 - DR + DRWP Value-initialization with multiple default constructors Unknown @@ -17016,7 +17012,7 @@

C++ defect report implementation status

2861 - DR + DRWP dynamic_cast on bad pointer value Unknown @@ -17034,13 +17030,13 @@

C++ defect report implementation status

2864 - DR + DRWP Narrowing floating-point conversions Unknown 2865 - DR + DRWP Regression on result of conditional operator Unknown @@ -17052,7 +17048,7 @@

C++ defect report implementation status

2867 - DR + DRWP Order of initialization for structured bindings Unknown @@ -17064,25 +17060,25 @@

C++ defect report implementation status

2869 - DR + DRWP this in local classes Unknown 2870 - DR + DRWP Combining absent encoding-prefixes Unknown 2871 - DR + DRWP User-declared constructor templates inhibiting default constructors Unknown 2872 - DR + DRWP Linkage and unclear "can be referred to" Unknown @@ -17094,7 +17090,7 @@

C++ defect report implementation status

2874 - DR + DRWP Qualified declarations of partial specializations Unknown @@ -17106,13 +17102,13 @@

C++ defect report implementation status

2876 - accepted + WP Disambiguation of T x = delete("text") Unknown 2877 - DR + DRWP Type-only lookup for using-enum-declarator Clang 19 @@ -17122,33 +17118,33 @@

C++ defect report implementation status

C-style casts to reference types Not resolved - + 2879 - review + DR Undesired outcomes with const_cast - Not resolved + Unknown 2880 - accepted + WP Accessibility check for destructor of incomplete class type Unknown 2881 - DR + DRWP Type restrictions for the explicit object parameter of a lambda Clang 19 2882 - DR + DRWP Unclear treatment of conversion to void Clang 2.7 2883 - DR + DRWP Definition of "odr-usable" ignores lambda scopes No @@ -17170,13 +17166,13 @@

C++ defect report implementation status

2886 - DR + DRWP Temporaries and trivial potentially-throwing special member functions Clang 9 2887 - DR + DRWP Missing compatibility entries for xvalues Unknown @@ -17192,21 +17188,21 @@

C++ defect report implementation status

Requiring an accessible destructor for destroying operator delete Not resolved - + 2890 - review + DR Defining members of local classes - Not resolved + Unknown 2891 - DR + DRWP Normative status of implementation limits Unknown 2892 - DR + DRWP Unclear usual arithmetic conversions Unknown @@ -17216,15 +17212,15 @@

C++ defect report implementation status

Instantiations in discarded if constexpr substatements Unknown - + 2894 - review + DR Functional casts create prvalues of reference type - Not resolved + Unknown 2895 - DR + DRWP Initialization should ignore the destination type's cv-qualification Unknown @@ -17246,11 +17242,11 @@

C++ defect report implementation status

Clarify implicit conversion sequence from cv T to T Not resolved - + 2899 - tentatively ready + DR Bad value representations should cause undefined behavior - Not resolved + Unknown 2900 @@ -17258,11 +17254,11 @@

C++ defect report implementation status

Deduction of non-type template arguments with placeholder types Not resolved - + 2901 - tentatively ready + DR Unclear semantics for near-match aliased access - Not resolved + Unknown 2902 @@ -17272,7 +17268,7 @@

C++ defect report implementation status

2903 - tentatively ready + drafting Can we omit the template disambiguator in nested-name-specifiers in type-only contexts? Not resolved @@ -17282,47 +17278,47 @@

C++ defect report implementation status

Introducing template-names Not resolved - + 2905 - tentatively ready + DR Value-dependence of noexcept-expression - Not resolved + Unknown - + 2906 - tentatively ready + DR Lvalue-to-rvalue conversion of class types for conditional operator - Not resolved + Unknown - + 2907 - tentatively ready + DR Constant lvalue-to-rvalue conversion on uninitialized std::nullptr_t - Not resolved + Unknown - + 2908 - tentatively ready + DR Counting physical source lines for __LINE__ - Not resolved + Unknown - + 2909 - review + DR Subtle difference between constant-initialized and constexpr - Not resolved + Unknown - + 2910 - tentatively ready + DR Effect of requirement-parameter-lists on odr-usability - Not resolved + Unknown - + 2911 - tentatively ready + DR Unclear meaning of expressions "appearing within" subexpressions - Not resolved + Unknown 2912 @@ -17330,15 +17326,11 @@

C++ defect report implementation status

Too-large value for size in array new Not resolved - + 2913 - tentatively ready + DR Grammar for deduction-guide has requires-clause in the wrong position - -
- Not resolved - Clang 20 implements 2024-08-16 resolution -
+ Clang 20 2914 @@ -17346,15 +17338,11 @@

C++ defect report implementation status

Unclear order of initialization of static and thread-local variables Not resolved - + 2915 - tentatively ready + DR Explicit object parameters of type void - -
- Not resolved - Clang 20 implements 2024-08-16 resolution -
+ Clang 20 2916 @@ -17372,17 +17360,17 @@

C++ defect report implementation status

Clang 20 implements 2024-07-30 resolution - + 2918 - tentatively ready + DR Consideration of constraints for address of overloaded function - Not resolved + Unknown - + 2919 - tentatively ready + DR Conversion function candidates for initialization of const lvalue reference - Not resolved + Unknown 2920 @@ -17390,33 +17378,29 @@

C++ defect report implementation status

The template keyword for base classes Not resolved - + 2921 - tentatively ready + DR Exporting redeclarations of entities not attached to a named module - Not resolved + Unknown - + 2922 - tentatively ready + DR constexpr placement-new is too permissive - -
- Not resolved - Clang 20 implements 2024-07-10 resolution -
+ Clang 20 2923 - tentatively ready + review Note about infinite loops and execution steps Not resolved - + 2924 - review + DR Undefined behavior during constant evaluation - Not resolved + Unknown 2925 @@ -17426,15 +17410,15 @@

C++ defect report implementation status

2926 - open + drafting Lookup context for dependent qualified names Not resolved - + 2927 - tentatively ready + DR Unclear status of translation unit with module keyword - Not resolved + Unknown 2928 @@ -17444,21 +17428,21 @@

C++ defect report implementation status

2929 - tentatively ready + review Lifetime of trivially-destructible static or thread-local objects Not resolved - + 2930 - tentatively ready + DR Unclear term "copy/move operation" in specification of copy elision - Not resolved + Unknown - + 2931 - tentatively ready + DR Restrictions on operator functions that are explicit object member functions - Not resolved + Unknown 2932 @@ -17466,11 +17450,11 @@

C++ defect report implementation status

Value range of empty enumeration Not resolved - + 2933 - open + DR Dangling references - Not resolved + Unknown 2934 @@ -17484,17 +17468,17 @@

C++ defect report implementation status

Destroying the coroutine state when initial-await-resume-called is false Not resolved - + 2936 - open + DR Local classes of templated functions should be part of the current instantiation - Not resolved + Unknown - + 2937 - open + DR Grammar for preprocessing-file has no normative effect - Not resolved + Unknown 2938 @@ -17502,10 +17486,268 @@

C++ defect report implementation status

Inheriting linkage from a previous declaration Not resolved - + 2939 - open + DR Do not allow reinterpret_cast from prvalue to rvalue reference + Unknown + + + 2940 + review + Definition of "object" + Not resolved + + + 2941 + open + Lifetime extension for function-style cast to reference type + Not resolved + + + 2942 + open + Packs in a function's parameter-type-list + Not resolved + + + 2943 + open + Discarding a void return value + Not resolved + + + 2944 + DR + Unsequenced throw-expressions + Unknown + + + 2945 + open + Redundant constraints on matching function template declarations + Not resolved + + + 2946 + open + Dependent call equivalence in non-ADL cases + Not resolved + + + 2947 + open + Limiting macro expansion in pp-module + Not resolved + + + 2948 + open + Late ambiguity for partial template specialization + Not resolved + + + 2949 + open + Treatment of ellipsis during partial ordering + Not resolved + + + 2950 + open + Value preservation in enumeration vs. integer bit-fields + Not resolved + + + 2951 + open + Distinguishing a primary template + Not resolved + + + 2952 + open + Vacuous initialization for subobjects + Not resolved + + + 2953 + open + Value representation for non-trivially-copyable types + Not resolved + + + 2954 + NAD + Simultaneous modifications of an atomic object + Unknown + + + 2955 + open + Unify rules about conflicting unordered accesses + Not resolved + + + 2956 + open + Missing allowance for pseudo-destructors in qualified lookup + Not resolved + + + 2957 + open + Evaluating a reference member should constitute access + Not resolved + + + 2958 + open + Overload resolution involving lvalue transformation and qualification conversion + Not resolved + + + 2959 + open + Naming enumerators in class member access expressions + Not resolved + + + 2960 + open + Introduce discontiguous object lifetime + Not resolved + + + 2961 + open + Checking of ill-formed types in constraint-expressions + Not resolved + + + 2962 + open + Evaluation 
of destructor call for variable with constant destruction + Not resolved + + + 2963 + open + Paradoxical variable-or-function declaration + Not resolved + + + 2964 + open + Reading "invalid pointer values" + Not resolved + + + 2965 + open + Generic lambdas do not have a template parameter scope + Not resolved + + + 2966 + open + Alignment and value representation of std::nullptr_t + Not resolved + + + 2967 + open + Explicit conversion functions + Not resolved + + + 2968 + open + Name lookup result for typedef-name vs. class-name + Not resolved + + + 2969 + open + Scopes in the function-try-block of a constructor + Not resolved + + + 2970 + open + Races with volatile sig_atomic_t bit-fields + Not resolved + + + 2971 + open + Specializations for a class are not decl-reachable + Not resolved + + + 2972 + open + Declarative nested-name-specifier naming a partial specialization + Not resolved + + + 2973 + open + Does an alias-declaration introduce a name for linkage purposes? + Not resolved + + + 2974 + open + Non-deduced context for qualified-id naming a template + Not resolved + + + 2975 + open + Effect of concept template-head on parameter mappings + Not resolved + + + 2976 + open + Transferring control out of a function + Not resolved + + + 2977 + open + Initialization with string literals + Not resolved + + + 2978 + open + Deduction involving reference to similar types + Not resolved + + + 2979 + open + Duplicate declarations of enumerations in class scope + Not resolved + + + 2980 + open + Constraints on template template parameters + Not resolved + + + 2981 + open + Usual arithmetic conversions and result types + Not resolved + + + 2982 + open + Deduction in type-constraints Not resolved diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status index f9a35c61c12de..e0885fdbd2d3c 100755 --- a/clang/www/make_cxx_dr_status +++ b/clang/www/make_cxx_dr_status @@ -169,6 +169,12 @@ def availability(issue): elif status.endswith(' c++20'): status = 
status[:-6] avail_suffix = ' (C++20 onwards)' + elif status.endswith(' c++23'): + status = status[:-6] + avail_suffix = ' (C++23 onwards)' + elif status.endswith(' c++26'): + status = status[:-6] + avail_suffix = ' (C++26 onwards)' if status == 'unknown': avail = 'Unknown' avail_style = 'unknown' From 4a7c0b8afe6bf616cd6bb4f13b5b706a43c10e74 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 4 Jan 2025 15:09:03 +0000 Subject: [PATCH 154/480] [LV] Add X86-specific induction step tests. Adds additional test coverage for induction codegen. --- .../LoopVectorize/X86/induction-step.ll | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/X86/induction-step.ll diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll new file mode 100644 index 0000000000000..6aac11a579719 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll @@ -0,0 +1,155 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +define i16 @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { +; CHECK-LABEL: @wide_add_induction_step_live_in( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[O_1:%.*]] = add i16 [[OFF:%.*]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16 +; CHECK-NEXT: [[TMP0:%.*]] = mul i16 [[DOTCAST]], [[O_1]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> 
[[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i16> splat (i16 4), [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> , [[DOTSPLAT]] +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP2]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 4 +; CHECK-NEXT: store <4 x i16> [[TMP4]], ptr [[TMP6]], align 2 +; CHECK-NEXT: store <4 x i16> [[TMP9]], ptr [[TMP8]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: 
middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i16 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[ADD]] = add i16 [[IV_2]], [[O_1]] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i16 [[ADD]], ptr [[GEP_DST]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i16 [ [[ADD]], [[LOOP]] ], [ [[TMP0]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i16 [[ADD_LCSSA]] +; +entry: + %o.1 = add i16 %off, 2 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.2 = phi i16 [ 0, %entry ], [ %add, %loop ] + %add = add i16 %iv.2, %o.1 + %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv + store i16 %add, ptr %gep.dst, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %N + br i1 %ec , label %exit, label %loop + +exit: + ret i16 %add +} + +define i16 @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { +; CHECK-LABEL: @wide_sub_induction_step_live_in( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[O_1:%.*]] = add i16 [[OFF:%.*]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = sub i16 -2, [[OFF]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label 
[[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16 +; CHECK-NEXT: [[TMP1:%.*]] = mul i16 [[DOTCAST]], [[TMP0]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> splat (i16 4), [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i16> , [[DOTSPLAT]] +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP3]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 4 +; CHECK-NEXT: 
store <4 x i16> [[TMP5]], ptr [[TMP7]], align 2 +; CHECK-NEXT: store <4 x i16> [[TMP10]], ptr [[TMP9]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[SUB:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[SUB]] = sub i16 [[IV_2]], [[O_1]] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i16 [[SUB]], ptr [[GEP_DST]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i16 [ [[SUB]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i16 [[SUB_LCSSA]] +; +entry: + %o.1 = add i16 %off, 2 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.2 = phi i16 [ 0, %entry ], [ %sub, %loop ] + %sub = sub i16 %iv.2, %o.1 + %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv + store i16 %sub, ptr %gep.dst, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %N + br i1 %ec , label %exit, label 
%loop + +exit: + ret i16 %sub +} From 24c2ba07ce65a5bf7d1113e05c517169d950b663 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Sun, 5 Jan 2025 01:20:54 +0800 Subject: [PATCH 155/480] [InstCombine] Drop NSW when converting `shl X, BW - 1` back into mul (#121633) `X < bool { + auto MatchShiftOrMulXC = [](Value *Op, Value *&V, APInt &C, + bool &PreserveNSW) -> bool { const APInt *Tmp = nullptr; if ((!V && match(Op, m_Mul(m_Value(V), m_APInt(Tmp)))) || (V && match(Op, m_Mul(m_Specific(V), m_APInt(Tmp))))) C = *Tmp; else if ((!V && match(Op, m_Shl(m_Value(V), m_APInt(Tmp)))) || - (V && match(Op, m_Shl(m_Specific(V), m_APInt(Tmp))))) + (V && match(Op, m_Shl(m_Specific(V), m_APInt(Tmp))))) { C = APInt(Tmp->getBitWidth(), 1) << *Tmp; + // We cannot preserve NSW when shifting by BW - 1. + PreserveNSW = Tmp->ult(Tmp->getBitWidth() - 1); + } if (Tmp != nullptr) return true; @@ -2095,7 +2099,9 @@ static Instruction *simplifyIRemMulShl(BinaryOperator &I, return false; }; - if (MatchShiftOrMulXC(Op0, X, Y) && MatchShiftOrMulXC(Op1, X, Z)) { + bool Op0PreserveNSW = true, Op1PreserveNSW = true; + if (MatchShiftOrMulXC(Op0, X, Y, Op0PreserveNSW) && + MatchShiftOrMulXC(Op1, X, Z, Op1PreserveNSW)) { // pass } else if (MatchShiftCX(Op0, Y, X) && MatchShiftCX(Op1, Z, X)) { ShiftByX = true; @@ -2108,7 +2114,7 @@ static Instruction *simplifyIRemMulShl(BinaryOperator &I, OverflowingBinaryOperator *BO0 = cast(Op0); // TODO: We may be able to deduce more about nsw/nuw of BO0/BO1 based on Y >= // Z or Z >= Y. - bool BO0HasNSW = BO0->hasNoSignedWrap(); + bool BO0HasNSW = Op0PreserveNSW && BO0->hasNoSignedWrap(); bool BO0HasNUW = BO0->hasNoUnsignedWrap(); bool BO0NoWrap = IsSRem ? 
BO0HasNSW : BO0HasNUW; @@ -2131,7 +2137,7 @@ static Instruction *simplifyIRemMulShl(BinaryOperator &I, }; OverflowingBinaryOperator *BO1 = cast(Op1); - bool BO1HasNSW = BO1->hasNoSignedWrap(); + bool BO1HasNSW = Op1PreserveNSW && BO1->hasNoSignedWrap(); bool BO1HasNUW = BO1->hasNoUnsignedWrap(); bool BO1NoWrap = IsSRem ? BO1HasNSW : BO1HasNUW; // (rem (mul X, Y), (mul nuw/nsw X, Z)) diff --git a/llvm/test/Transforms/InstCombine/rem-mul-shl.ll b/llvm/test/Transforms/InstCombine/rem-mul-shl.ll index e7d6cc7102c71..920497c07e380 100644 --- a/llvm/test/Transforms/InstCombine/rem-mul-shl.ll +++ b/llvm/test/Transforms/InstCombine/rem-mul-shl.ll @@ -372,6 +372,32 @@ define <2 x i8> @srem_XY_XZ_with_CY_gt_CZ_no_nuw_out(<2 x i8> %X) { ret <2 x i8> %r } +define i8 @srem_XY_XZ_with_CY_gt_CZ_drop_nsw(i8 noundef %X) { +; CHECK-LABEL: @srem_XY_XZ_with_CY_gt_CZ_drop_nsw( +; CHECK-NEXT: [[BO0:%.*]] = mul nsw i8 [[X:%.*]], 127 +; CHECK-NEXT: [[BO1:%.*]] = shl nsw i8 [[X]], 7 +; CHECK-NEXT: [[R:%.*]] = srem i8 [[BO1]], [[BO0]] +; CHECK-NEXT: ret i8 [[R]] +; + %BO0 = mul nsw i8 %X, 127 + %BO1 = shl nsw i8 %X, 7 + %r = srem i8 %BO1, %BO0 + ret i8 %r +} + +define i8 @srem_XY_XZ_with_CY_gt_CZ_drop_nsw_commuted(i8 noundef %X) { +; CHECK-LABEL: @srem_XY_XZ_with_CY_gt_CZ_drop_nsw_commuted( +; CHECK-NEXT: [[BO0:%.*]] = mul nsw i8 [[X:%.*]], 127 +; CHECK-NEXT: [[BO1:%.*]] = shl nsw i8 [[X]], 7 +; CHECK-NEXT: [[R:%.*]] = srem i8 [[BO0]], [[BO1]] +; CHECK-NEXT: ret i8 [[R]] +; + %BO0 = mul nsw i8 %X, 127 + %BO1 = shl nsw i8 %X, 7 + %r = srem i8 %BO0, %BO1 + ret i8 %r +} + define i8 @srem_XY_XZ_with_CY_gt_CZ_fail_missing_flag1(i8 %X) { ; CHECK-LABEL: @srem_XY_XZ_with_CY_gt_CZ_fail_missing_flag1( ; CHECK-NEXT: [[BO0:%.*]] = mul nuw nsw i8 [[X:%.*]], 10 From ce6251540d7af30585d4ca753ca2a0ab34d32be2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 4 Jan 2025 16:29:30 +0000 Subject: [PATCH 156/480] [X86] vector overflow tests - regenerate VPTERNLOG comments --- 
llvm/test/CodeGen/X86/vec_saddo.ll | 6 +++--- llvm/test/CodeGen/X86/vec_ssubo.ll | 6 +++--- llvm/test/CodeGen/X86/vec_uaddo.ll | 6 +++--- llvm/test/CodeGen/X86/vec_usubo.ll | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll index 460c5fe11f82a..78dd2cf783ef8 100644 --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -517,7 +517,7 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) @@ -647,7 +647,7 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpneqb %xmm2, %xmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) @@ -993,7 +993,7 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpslld $8, %xmm1, %xmm0 ; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax ; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll index d06993da6365d..746c09e5e70db 100644 
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -522,7 +522,7 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) @@ -652,7 +652,7 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpneqb %xmm2, %xmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) @@ -1010,7 +1010,7 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpslld $8, %xmm1, %xmm0 ; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax ; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index bac118095331c..be7888cd76a6b 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -604,7 +604,7 @@ define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpltud %zmm0, %zmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; 
AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) @@ -730,7 +730,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpltub %xmm0, %xmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) @@ -1046,7 +1046,7 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax ; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll index ab75ada72f256..ceb1ad13bc153 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -647,7 +647,7 @@ define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX512: # %bb.0: ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpnleud %zmm0, %zmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) @@ -773,7 +773,7 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: 
vpcmpnleub %xmm0, %xmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) @@ -1093,7 +1093,7 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax ; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx From 9a95c097d0466c594f40a4ba9ced8a155574fdff Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Sat, 4 Jan 2025 10:02:48 -0800 Subject: [PATCH 157/480] [libc] Remove some unused includes from headers under src/math/generic. (#121632) These were indicated by Clang include-cleaner. 
--- libc/src/math/generic/CMakeLists.txt | 3 --- libc/src/math/generic/exp10f_impl.h | 3 --- libc/src/math/generic/range_reduction_double_common.h | 1 - libc/src/math/generic/sincosf16_utils.h | 2 -- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 4 ---- 5 files changed, 13 deletions(-) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index b3d4612915197..382f5b362e2eb 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -358,7 +358,6 @@ add_header_library( HDRS sincosf16_utils.h DEPENDS - libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.polyeval libc.src.__support.FPUtil.nearest_integer libc.src.__support.common @@ -1702,8 +1701,6 @@ add_header_library( libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval libc.src.__support.FPUtil.rounding_mode libc.src.__support.macros.optimization libc.src.__support.common diff --git a/libc/src/math/generic/exp10f_impl.h b/libc/src/math/generic/exp10f_impl.h index d741318382e1f..975fd01a0a25c 100644 --- a/libc/src/math/generic/exp10f_impl.h +++ b/libc/src/math/generic/exp10f_impl.h @@ -10,12 +10,9 @@ #define LLVM_LIBC_SRC_MATH_GENERIC_EXP10F_IMPL_H #include "explogxf.h" -#include "src/__support/FPUtil/BasicOperations.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" #include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/nearest_integer.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" diff --git a/libc/src/math/generic/range_reduction_double_common.h b/libc/src/math/generic/range_reduction_double_common.h index bcab82f6c9c3a..06aeb49495ad2 100644 --- a/libc/src/math/generic/range_reduction_double_common.h +++ 
b/libc/src/math/generic/range_reduction_double_common.h @@ -9,7 +9,6 @@ #ifndef LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_COMMON_H #define LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_COMMON_H -#include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/double_double.h" #include "src/__support/FPUtil/dyadic_float.h" #include "src/__support/FPUtil/multiply_add.h" diff --git a/libc/src/math/generic/sincosf16_utils.h b/libc/src/math/generic/sincosf16_utils.h index 5e5edd4a8c85b..87b1dde560c5e 100644 --- a/libc/src/math/generic/sincosf16_utils.h +++ b/libc/src/math/generic/sincosf16_utils.h @@ -9,9 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_GENERIC_SINCOSF16_UTILS_H #define LLVM_LIBC_SRC_MATH_GENERIC_SINCOSF16_UTILS_H -#include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/cast.h" #include "src/__support/FPUtil/nearest_integer.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 91c7db9029a66..15fa4123b75fe 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1797,7 +1797,6 @@ libc_support_library( hdrs = ["src/math/generic/sincosf16_utils.h"], deps = [ ":__support_common", - ":__support_fputil_fp_bits", ":__support_fputil_nearest_integer", ":__support_fputil_polyeval", ], @@ -1846,11 +1845,8 @@ libc_support_library( name = "exp10f_impl", hdrs = ["src/math/generic/exp10f_impl.h"], deps = [ - ":__support_fputil_basic_operations", ":__support_fputil_fma", ":__support_fputil_multiply_add", - ":__support_fputil_nearest_integer", - ":__support_fputil_polyeval", ":__support_fputil_rounding_mode", ":__support_macros_optimization", ":common_constants", From c19e0d63b45f9c97157060c662396820ce2a1621 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 4 Jan 2025 10:56:45 -0800 Subject: 
[PATCH 158/480] [gcov,test] Update exit-block.ll now that exit block is always the second Follow-up to 82fecab85ae2d72ffac0e44749d99f12d6f71cc0 --- llvm/test/Transforms/GCOVProfiling/exit-block.ll | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/llvm/test/Transforms/GCOVProfiling/exit-block.ll b/llvm/test/Transforms/GCOVProfiling/exit-block.ll index 567e22222f580..1840f045b3ffe 100644 --- a/llvm/test/Transforms/GCOVProfiling/exit-block.ll +++ b/llvm/test/Transforms/GCOVProfiling/exit-block.ll @@ -3,13 +3,9 @@ ; RUN: echo '!19 = !{!"%/t/exit-block.ll", !0}' > %t/1 ; RUN: cat %s %t/1 > %t/2 -; By default, the exit block is the second. +; The exit block is the second. ; RUN: opt -passes=insert-gcov-profiling -disable-output %t/2 -; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck --check-prefixes=CHECK,EXIT-SECOND %s - -; But we can optionally emit it last, to match GCC<4.8 (r189778). -; RUN: opt -passes=insert-gcov-profiling -default-gcov-version='407*' -disable-output %t/2 -; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck --check-prefixes=CHECK,EXIT-SECOND %s +; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -66,10 +62,7 @@ attributes #2 = { nounwind } ; There should be no destination edges for the exit block. ; CHECK: Block : 1 Counter : 0 -; EXIT-LAST: Destination Edges -; EXIT-SECOND-NOT: Destination Edges ; CHECK: Block : 2 Counter : 0 ; CHECK: Block : 4 Counter : 0 -; EXIT-LAST-NOT: Destination Edges -; EXIT-SECOND: Destination Edges +; CHECK: Destination Edges ; CHECK-NOT: Block : From b95cce99049d6b79c418c9981dc39ede2850994e Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 31 Dec 2024 12:08:43 +0000 Subject: [PATCH 159/480] [VPlan] Update wide induction inc recipes to use same step as Wide IV. 
Update wide induction increments to use the same step as the corresponding wide induction. This enables detecting induction increments directly in VPlan and removes redundant splats. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 14 ++++++++++++++ .../Transforms/LoopVectorize/X86/induction-step.ll | 6 ++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 7ef5295bb1276..5b75f6b26b6c5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9311,6 +9311,20 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); + // Update wide induction increments to use the same step as the corresponding + // wide induction. This enables detecting induction increments directly in + // VPlan and removes redundant splats. + for (const auto &[Phi, ID] : Legal->getInductionVars()) { + auto *IVInc = cast<Instruction>( + Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); + if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add) + continue; + VPWidenInductionRecipe *WideIV = + cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi)); + VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc); + R->setOperand(1, WideIV->getStepValue()); + } + if (auto *UncountableExitingBlock = Legal->getUncountableEarlyExitingBlock()) { VPlanTransforms::handleUncountableEarlyExit( diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll index 6aac11a579719..f6a9767c7f87d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll @@ -21,16 +21,14 @@ define i16 @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32>
zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> <i16 0, i16 1, i16 2, i16 3>, [[DOTSPLAT]] ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP2]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT2]] -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 4 From 1fa0036226d0ffad624bfb43595d00885db546b9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 4 Jan 2025 11:49:12 -0800 Subject: [PATCH 160/480] [AArch64] Remove one unnecessary include of AArch64GenSystemOperands.inc. NFC GET_PRCTX_DECL and GET_PRCTX_IMPL don't exist in AArch64GenSystemOperands.inc so this include does nothing. It looks like it was removed in 2050e7ebe18cc4cf906d9b54d17ee885cd868327.
--- llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp | 7 ------- llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h | 8 -------- 2 files changed, 15 deletions(-) diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index d83c22e717950..49ce0a58f4167 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -66,13 +66,6 @@ namespace llvm { } } -namespace llvm { - namespace AArch64PRCTX { -#define GET_PRCTX_IMPL -#include "AArch64GenSystemOperands.inc" - } -} - namespace llvm { namespace AArch64PRFM { #define GET_PRFM_IMPL diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index e0ccba4d6a59e..e7db9077b6439 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -731,14 +731,6 @@ namespace AArch64TLBI { #include "AArch64GenSystemOperands.inc" } -namespace AArch64PRCTX { - struct PRCTX : SysAliasReg { - using SysAliasReg::SysAliasReg; - }; - #define GET_PRCTX_DECL - #include "AArch64GenSystemOperands.inc" -} - namespace AArch64II { /// Target Operand Flag enum. enum TOF { From 59354a865fe408749634456e10bd76a50d785c2b Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Sat, 4 Jan 2025 20:15:05 +0000 Subject: [PATCH 161/480] [compiler-rt][rtsan] intercept fflush. 
(#121643) --- .../lib/rtsan/rtsan_interceptors_posix.cpp | 13 +++++++++ .../tests/rtsan_test_interceptors_posix.cpp | 28 +++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 9f89ab6bf1fc7..f1fe20b255d9c 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -292,6 +292,18 @@ INTERCEPTOR(int, fputs, const char *s, FILE *stream) { return REAL(fputs)(s, stream); } +INTERCEPTOR(int, fflush, FILE *stream) { + __rtsan_notify_intercepted_call("fflush"); + return REAL(fflush)(stream); +} + +#if SANITIZER_APPLE +INTERCEPTOR(int, fpurge, FILE *stream) { + __rtsan_notify_intercepted_call("fpurge"); + return REAL(fpurge)(stream); +} +#endif + INTERCEPTOR(FILE *, fdopen, int fd, const char *mode) { __rtsan_notify_intercepted_call("fdopen"); return REAL(fdopen)(fd, mode); @@ -981,6 +993,7 @@ void __rtsan::InitializeInterceptors() { RTSAN_MAYBE_INTERCEPT_CREAT64; INTERCEPT_FUNCTION(puts); INTERCEPT_FUNCTION(fputs); + INTERCEPT_FUNCTION(fflush); INTERCEPT_FUNCTION(fdopen); INTERCEPT_FUNCTION(freopen); RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE; diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index 5adbf0fb63de8..15dfc1af01625 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -604,6 +604,34 @@ TEST_F(RtsanOpenedFileTest, FputsDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } +TEST_F(RtsanFileTest, FflushDiesWhenRealtime) { + FILE *f = fopen(GetTemporaryFilePath(), "w"); + EXPECT_THAT(f, Ne(nullptr)); + int written = fwrite("abc", 1, 3, f); + EXPECT_THAT(written, Eq(3)); + auto Func = [&f]() { + int res = fflush(f); + EXPECT_THAT(res, Eq(0)); + }; + ExpectRealtimeDeath(Func, "fflush"); + 
ExpectNonRealtimeSurvival(Func); +} + +#if SANITIZER_APPLE +TEST_F(RtsanFileTest, FpurgeDiesWhenRealtime) { + FILE *f = fopen(GetTemporaryFilePath(), "w"); + EXPECT_THAT(f, Ne(nullptr)); + int written = fwrite("abc", 1, 3, f); + EXPECT_THAT(written, Eq(3)); + auto Func = [&f]() { + int res = fpurge(f); + EXPECT_THAT(res, Eq(0)); + }; + ExpectRealtimeDeath(Func, "fpurge"); + ExpectNonRealtimeSurvival(Func); +} +#endif + TEST_F(RtsanOpenedFileTest, ReadDiesWhenRealtime) { auto Func = [this]() { char c{}; From afef716e839bf7dd96ebce5264779b1d316db58e Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Sat, 4 Jan 2025 21:28:59 +0100 Subject: [PATCH 162/480] [mlir][Transforms] Fix build after #116524 (part 2) (#121662) Since #116524, an integration test started to become flaky (failure rate ~15%). ``` bin/mlir-opt mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir --sparsifier="enable-arm-sve=true enable-runtime-library=false vl=2 reassociate-fp-reductions=true enable-index-optimizations=true" | mlir-cpu-runner --march=aarch64 --mattr="+sve" -e main -entry-point-result=void -shared-libs=./lib/libmlir_runner_utils.so,./lib/libmlir_c_runner_utils.so | bin/FileCheck mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir # executed command: bin/mlir-opt mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir '--sparsifier=enable-arm-sve=true enable-runtime-library=false vl=2 reassociate-fp-reductions=true enable-index-optimizations=true' # .---command stderr------------ # | mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir:71:10: error: null operand found # | %0 = linalg.generic #trait_mul # | ^ # | mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir:71:10: note: see current operation: %70 = "arith.mulf"(<>, %69) <{fastmath = #arith.fastmath}> : (<>, vector<[2]xf64>) -> vector<[2]xf64> # `----------------------------- # error: command failed with exit status: 1 ``` I traced 
the issue back to the `DenseMap mapping;` data structure: previously, some `mapping.erase(foo)` calls were unsuccessful (returning `false`), even though the `mapping` contains `foo` as a key. --- mlir/lib/Transforms/Utils/DialectConversion.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 0e577d2d39de3..48b8c727a7828 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -103,8 +103,8 @@ namespace { /// Helper class to make it possible to use `ValueVector` as a key in DenseMap. struct ValueVectorMapInfo { - static ValueVector getEmptyKey() { return ValueVector{}; } - static ValueVector getTombstoneKey() { return ValueVector{}; } + static ValueVector getEmptyKey() { return ValueVector{Value()}; } + static ValueVector getTombstoneKey() { return ValueVector{Value(), Value()}; } static ::llvm::hash_code getHashValue(const ValueVector &val) { return ::llvm::hash_combine_range(val.begin(), val.end()); } From fd38a95586477f8f60330ef723406d69b33b91f6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 4 Jan 2025 12:31:31 -0800 Subject: [PATCH 163/480] [TargetParser] Use StringRef::split that takes a char separator instead of StringRef separator. NFC --- llvm/lib/TargetParser/Host.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 45b4cafc99598..9d1b7b8b0e7cd 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -173,7 +173,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { // Read 32 lines from /proc/cpuinfo, which should contain the CPU part line // in all cases. SmallVector Lines; - ProcCpuinfoContent.split(Lines, "\n"); + ProcCpuinfoContent.split(Lines, '\n'); // Look for the CPU implementer line. 
StringRef Implementer; @@ -436,7 +436,7 @@ StringRef sys::detail::getHostCPUNameForS390x(StringRef ProcCpuinfoContent) { // The "processor 0:" line comes after a fair amount of other information, // including a cache breakdown, but this should be plenty. SmallVector Lines; - ProcCpuinfoContent.split(Lines, "\n"); + ProcCpuinfoContent.split(Lines, '\n'); // Look for the CPU features. SmallVector CPUFeatures; @@ -478,7 +478,7 @@ StringRef sys::detail::getHostCPUNameForS390x(StringRef ProcCpuinfoContent) { StringRef sys::detail::getHostCPUNameForRISCV(StringRef ProcCpuinfoContent) { // There are 24 lines in /proc/cpuinfo SmallVector Lines; - ProcCpuinfoContent.split(Lines, "\n"); + ProcCpuinfoContent.split(Lines, '\n'); // Look for uarch line to determine cpu name StringRef UArch; @@ -1630,7 +1630,7 @@ StringRef sys::getHostCPUName() { #if defined(__linux__) StringRef sys::detail::getHostCPUNameForSPARC(StringRef ProcCpuinfoContent) { SmallVector Lines; - ProcCpuinfoContent.split(Lines, "\n"); + ProcCpuinfoContent.split(Lines, '\n'); // Look for cpu line to determine cpu name StringRef Cpu; @@ -1970,7 +1970,7 @@ const StringMap sys::getHostCPUFeatures() { return Features; SmallVector Lines; - P->getBuffer().split(Lines, "\n"); + P->getBuffer().split(Lines, '\n'); SmallVector CPUFeatures; From c56b74315f57acb1b285ddc77b07031b773549b7 Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Sun, 5 Jan 2025 00:11:24 +0300 Subject: [PATCH 164/480] [TableGen][GISel] Reuse `importNodeRenderer` for `OperandWithDefaultOps` (#121285) This avoids some code duplication (handling `Register`, `zero_reg` and immediate operands). 
--- .../GlobalISelEmitter/undef-tied-input.td | 17 +++- llvm/utils/TableGen/GlobalISelEmitter.cpp | 82 +++++++------------ 2 files changed, 43 insertions(+), 56 deletions(-) diff --git a/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td b/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td index a2ee3dc311772..323aea9e396d1 100644 --- a/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td +++ b/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td @@ -1,14 +1,25 @@ -// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common %s | FileCheck %s +// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns \ +// RUN: -I %p/../../../include -I %p/../Common %s 2> %t | FileCheck %s +// RUN: FileCheck -check-prefix=ERR %s < %t include "llvm/Target/Target.td" include "GlobalISelEmitterCommon.td" -def undef_tied : OperandWithDefaultOps { +def undef_tied_1 : OperandWithDefaultOps { let MIOperandInfo = (ops GPR32:$inactive); } +def undef_tied_2 : OperandWithDefaultOps { + let MIOperandInfo = (ops GPR32:$inactive); +} + +let Constraints = "$opt.inactive = $rd" in +def I1 : I<(outs GPR32:$rd), (ins GPR32:$rs, undef_tied_1:$opt), + [(set GPR32:$rd, (abs i32:$rs))]>; + +// ERR: [[#@LINE+2]]:5: warning: Skipped pattern: unsupported type let Constraints = "$opt.inactive = $rd" in -def I1 : I<(outs GPR32:$rd), (ins GPR32:$rs, undef_tied:$opt), +def I2 : I<(outs GPR32:$rd), (ins GPR32:$rs, undef_tied_2:$opt), [(set GPR32:$rd, (abs i32:$rs))]>; // CHECK-LABEL: // (abs:{ *:[i32] } i32:{ *:[i32] }:$rs) => (I1:{ *:[i32] } i32:{ *:[i32] }:$rs) diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 092cdd4ad5b43..9f6d3a506dceb 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -418,7 +418,8 @@ class GlobalISelEmitter final : public GlobalISelMatchTableExecutorEmitter { const TreePatternNode &N) const; Error importLeafNodeRenderer(RuleMatcher &M, 
BuildMIAction &MIBuilder, - const TreePatternNode &N) const; + const TreePatternNode &N, + action_iterator InsertPt) const; Error importXFormNodeRenderer(RuleMatcher &M, BuildMIAction &MIBuilder, const TreePatternNode &N) const; @@ -431,9 +432,6 @@ class GlobalISelEmitter final : public GlobalISelMatchTableExecutorEmitter { const TreePatternNode &N, action_iterator &InsertPt) const; - Error importDefaultOperandRenderers(action_iterator InsertPt, RuleMatcher &M, - BuildMIAction &DstMIBuilder, - const DAGDefaultOperand &DefaultOp) const; Error importImplicitDefRenderers(BuildMIAction &DstMIBuilder, ArrayRef ImplicitDefs) const; @@ -1291,7 +1289,8 @@ Error GlobalISelEmitter::importNamedNodeRenderer( // Equivalent of MatcherGen::EmitResultLeafAsOperand. Error GlobalISelEmitter::importLeafNodeRenderer( - RuleMatcher &M, BuildMIAction &MIBuilder, const TreePatternNode &N) const { + RuleMatcher &M, BuildMIAction &MIBuilder, const TreePatternNode &N, + action_iterator InsertPt) const { if (const auto *II = dyn_cast(N.getLeafValue())) { MIBuilder.addRenderer(II->getValue()); return Error::success(); @@ -1300,11 +1299,29 @@ Error GlobalISelEmitter::importLeafNodeRenderer( if (const auto *DI = dyn_cast(N.getLeafValue())) { const Record *R = DI->getDef(); - if (R->isSubClassOf("Register")) { + if (R->isSubClassOf("Register") || R->getName() == "zero_reg") { MIBuilder.addRenderer(Target, R); return Error::success(); } + if (R->getName() == "undef_tied_input") { + std::optional OpTyOrNone = MVTToLLT(N.getSimpleType(0)); + if (!OpTyOrNone) + return failedImport("unsupported type"); + + unsigned TempRegID = M.allocateTempRegID(); + M.insertAction(InsertPt, *OpTyOrNone, TempRegID); + + auto I = M.insertAction( + InsertPt, M.allocateOutputInsnID(), + &Target.getInstruction(RK.getDef("IMPLICIT_DEF"))); + auto &ImpDefBuilder = static_cast(**I); + ImpDefBuilder.addRenderer(TempRegID, /*IsDef=*/true); + + MIBuilder.addRenderer(TempRegID); + return Error::success(); + } + if 
(R->isSubClassOf("SubRegIndex")) { const CodeGenSubRegIndex *SubRegIndex = CGRegs.getSubRegIdx(R); MIBuilder.addRenderer(SubRegIndex->EnumValue); @@ -1386,7 +1403,7 @@ Error GlobalISelEmitter::importNodeRenderer(RuleMatcher &M, return importNamedNodeRenderer(M, MIBuilder, N); if (N.isLeaf()) - return importLeafNodeRenderer(M, MIBuilder, N); + return importLeafNodeRenderer(M, MIBuilder, N, InsertPt); if (N.getOperator()->isSubClassOf("SDNodeXForm")) return importXFormNodeRenderer(M, MIBuilder, N); @@ -1707,11 +1724,11 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( // This is a predicate or optional def operand which the pattern has not // overridden, or which we aren't letting it override; emit the 'default // ops' operands. - - const Record *OperandNode = DstI->Operands[InstOpNo].Rec; - if (auto Error = importDefaultOperandRenderers( - InsertPt, M, DstMIBuilder, CGP.getDefaultOperand(OperandNode))) - return std::move(Error); + for (const TreePatternNode &OpNode : + make_pointee_range(CGP.getDefaultOperand(OperandNode).DefaultOps)) { + if (Error Err = importNodeRenderer(M, DstMIBuilder, OpNode, InsertPt)) + return Err; + } ++NumDefaultOps; continue; @@ -1734,47 +1751,6 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( return InsertPt; } -Error GlobalISelEmitter::importDefaultOperandRenderers( - action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, - const DAGDefaultOperand &DefaultOp) const { - for (const auto &Op : DefaultOp.DefaultOps) { - const auto &N = *Op; - if (!N.isLeaf()) - return failedImport("Could not add default op"); - - const auto *DefaultOp = N.getLeafValue(); - - if (const DefInit *DefaultDefOp = dyn_cast(DefaultOp)) { - std::optional OpTyOrNone = MVTToLLT(N.getSimpleType(0)); - auto *Def = DefaultDefOp->getDef(); - if (Def->getName() == "undef_tied_input") { - unsigned TempRegID = M.allocateTempRegID(); - M.insertAction(InsertPt, *OpTyOrNone, - TempRegID); - InsertPt = M.insertAction( - InsertPt, 
M.allocateOutputInsnID(), - &Target.getInstruction(RK.getDef("IMPLICIT_DEF"))); - BuildMIAction &IDMIBuilder = - *static_cast(InsertPt->get()); - IDMIBuilder.addRenderer(TempRegID, /*IsDef=*/true); - DstMIBuilder.addRenderer(TempRegID); - } else { - DstMIBuilder.addRenderer(Target, Def); - } - continue; - } - - if (const IntInit *DefaultIntOp = dyn_cast(DefaultOp)) { - DstMIBuilder.addRenderer(DefaultIntOp->getValue()); - continue; - } - - return failedImport("Could not add default op"); - } - - return Error::success(); -} - Error GlobalISelEmitter::importImplicitDefRenderers( BuildMIAction &DstMIBuilder, ArrayRef ImplicitDefs) const { if (!ImplicitDefs.empty()) From f855ceeefc97220a052cc76a52a45c6907eac1f8 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Sat, 4 Jan 2025 13:04:35 -0600 Subject: [PATCH 165/480] [libc][NFC] use `__has_builtin` instead of checking macros. --- libc/shared/rpc_util.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/shared/rpc_util.h b/libc/shared/rpc_util.h index 9406de59f63b7..687814b7ff2ae 100644 --- a/libc/shared/rpc_util.h +++ b/libc/shared/rpc_util.h @@ -152,10 +152,10 @@ template class optional { /// Suspend the thread briefly to assist the thread scheduler during busy loops. RPC_ATTRS void sleep_briefly() { -#if defined(__NVPTX__) && defined(RPC_TARGET_IS_GPU) +#if __has_builtin(__nvvm_reflect) if (__nvvm_reflect("__CUDA_ARCH") >= 700) asm("nanosleep.u32 64;" ::: "memory"); -#elif defined(__AMDGPU__) && defined(RPC_TARGET_IS_GPU) +#elif __has_builtin(__builtin_amdgcn_s_sleep) __builtin_amdgcn_s_sleep(2); #elif __has_builtin(__builtin_ia32_pause) __builtin_ia32_pause(); From d1d400372adc9ae78d8ee9c2387b2c6b062b0dc0 Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Sat, 4 Jan 2025 21:46:08 +0000 Subject: [PATCH 166/480] [compiler-rt][rtsan] fix unit tests by sanitizer-aarch64-linux report. 
(#121666) --- compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index 15dfc1af01625..d9872c54b2614 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -361,10 +361,10 @@ TEST_F(RtsanFileTest, FopenCookieDieWhenRealtime) { FILE *fp; size_t read; } fh = {f, 0}; - auto CookieRead = [this](void *cookie, char *buf, size_t size) { + auto CookieRead = [](void *cookie, char *buf, size_t size) { fholder *p = reinterpret_cast(cookie); p->read = fread(static_cast(buf), 1, size, p->fp); - EXPECT_NE(0, p->read); + EXPECT_NE(0u, p->read); }; cookie_io_functions_t funcs = {(cookie_read_function_t *)&CookieRead, nullptr, nullptr, nullptr}; From 7db0a606a294bc788563b8363261efa0c13e3062 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Sat, 4 Jan 2025 21:55:12 +0000 Subject: [PATCH 167/480] [objcopy][COFF] Do not strip .rdata section with --only-keep-debug (#121653) When not in MinGW mode, the PE debug directory is placed in .rdata by the linker instead of .buildid. In addition to .buildid always explicitly preserve the section containing the debug directory to avoid causing errors later in patchDebugDirectory. 
--- llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp | 10 ++- .../COFF/Inputs/i386-debug-rdata.yaml | 63 +++++++++++++++++++ .../COFF/only-keep-debug-rdata.test | 45 +++++++++++++ 3 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 llvm/test/tools/llvm-objcopy/COFF/Inputs/i386-debug-rdata.yaml create mode 100644 llvm/test/tools/llvm-objcopy/COFF/only-keep-debug-rdata.test diff --git a/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp b/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp index 782d5b2f70c3e..cebcb823e6895 100644 --- a/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp +++ b/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp @@ -183,10 +183,18 @@ static Error handleArgs(const CommonConfig &Config, }); if (Config.OnlyKeepDebug) { + const data_directory *DebugDir = + Obj.DataDirectories.size() > DEBUG_DIRECTORY + ? &Obj.DataDirectories[DEBUG_DIRECTORY] + : nullptr; // For --only-keep-debug, we keep all other sections, but remove their // content. The VirtualSize field in the section header is kept intact. - Obj.truncateSections([](const Section &Sec) { + Obj.truncateSections([DebugDir](const Section &Sec) { return !isDebugSection(Sec) && Sec.Name != ".buildid" && + !(DebugDir && DebugDir->Size > 0 && + DebugDir->RelativeVirtualAddress >= Sec.Header.VirtualAddress && + DebugDir->RelativeVirtualAddress < + Sec.Header.VirtualAddress + Sec.Header.SizeOfRawData) && ((Sec.Header.Characteristics & (IMAGE_SCN_CNT_CODE | IMAGE_SCN_CNT_INITIALIZED_DATA)) != 0); }); diff --git a/llvm/test/tools/llvm-objcopy/COFF/Inputs/i386-debug-rdata.yaml b/llvm/test/tools/llvm-objcopy/COFF/Inputs/i386-debug-rdata.yaml new file mode 100644 index 0000000000000..02a6e9db19c19 --- /dev/null +++ b/llvm/test/tools/llvm-objcopy/COFF/Inputs/i386-debug-rdata.yaml @@ -0,0 +1,63 @@ +--- !COFF +OptionalHeader: + AddressOfEntryPoint: 4096 + ImageBase: 268435456 + SectionAlignment: 4096 + FileAlignment: 512 + MajorOperatingSystemVersion: 6 + MinorOperatingSystemVersion: 0 + MajorImageVersion: 0 + MinorImageVersion: 0 + 
MajorSubsystemVersion: 6 + MinorSubsystemVersion: 0 + Subsystem: IMAGE_SUBSYSTEM_WINDOWS_CUI + DLLCharacteristics: [ ] + SizeOfStackReserve: 1048576 + SizeOfStackCommit: 4096 + SizeOfHeapReserve: 1048576 + SizeOfHeapCommit: 4096 + Debug: + RelativeVirtualAddress: 8192 + Size: 28 +header: + Machine: IMAGE_FILE_MACHINE_I386 + Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_32BIT_MACHINE, IMAGE_FILE_DLL ] +sections: + - Name: .text + Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ] + VirtualAddress: 4096 + VirtualSize: 18 + SectionData: 5589E58B45108B450C8B450831C05DC20C00 + SizeOfRawData: 512 + - Name: .rdata + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ] + VirtualAddress: 8192 + VirtualSize: 109 + SectionData: 000000008D6978670000000002000000510000001C2000001C060000525344538B301061671ED0994C4C44205044422E010000002F686F6D652F6D652F446F63756D656E74732F6C6C766D2D6D696E67772F6C6C766D2D70726F6A6563742F6C6C766D2F746573742E70646200 + SizeOfRawData: 512 + - Name: .debug_abbrev + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] + VirtualAddress: 12288 + VirtualSize: 78 + SectionData: 011101250E1305030E10171B0E110112060000022E011101120640186E0E030E3A0B3B0B2719360B49133F1900000305000218030E3A0B3B0B49130000042400030E3E0B0B0B0000050F00000000 + SizeOfRawData: 512 + - Name: .debug_info + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] + VirtualAddress: 16384 + VirtualSize: 116 + SectionData: 700000000400000000000401000000001D006E000000000000007500000000100010120000000200100010120000000155A5000000BC0000000101B16B00000003029108D70000000101720000000302910CD500000001016B00000003029110D30000000101720000000004CF00000005040500 + SizeOfRawData: 512 + - Name: .debug_line + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] + VirtualAddress: 20480 + VirtualSize: 60 + 
SectionData: 3800000004001E000000010101FB0E0D00010101010000000100000100746573742E6300000000000005020010001001053D0ABA060B2E0204000101 + SizeOfRawData: 512 + - Name: .debug_str + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] + VirtualAddress: 24576 + VirtualSize: 217 + SectionData: 636C616E672076657273696F6E2032302E302E30676974202868747470733A2F2F6769746875622E636F6D2F62796C6177732F6C6C766D2D70726F6A6563742E67697420393963353263306236613662396366303765383365656265393364323831333635656165383732332900746573742E63002F686F6D652F6D652F446F63756D656E74732F6C6C766D2D6D696E67772F6C6C766D2D70726F6A6563742F6C6C766D005F5F446C6C4D61696E43525453746172747570403132005F446C6C4D61696E4352545374617274757000696E7400630062006100 + SizeOfRawData: 512 +symbols: [] +... diff --git a/llvm/test/tools/llvm-objcopy/COFF/only-keep-debug-rdata.test b/llvm/test/tools/llvm-objcopy/COFF/only-keep-debug-rdata.test new file mode 100644 index 0000000000000..affd4b65009f4 --- /dev/null +++ b/llvm/test/tools/llvm-objcopy/COFF/only-keep-debug-rdata.test @@ -0,0 +1,45 @@ +RUN: yaml2obj %p/Inputs/i386-debug-rdata.yaml -o %t.in.exe + +RUN: llvm-objcopy --only-keep-debug %t.in.exe %t.out.exe +RUN: llvm-readobj --sections %t.out.exe | FileCheck %s + +Check that all non-debug/rodata (which contains the debug directory in this case) +sections with IMAGE_SCN_CNT_CODE or IMAGE_SCN_CNT_INITIALIZED_DATA are truncated, +and no others. 
+ +CHECK: Section { +CHECK-NEXT: Number: 1 +CHECK-NEXT: Name: .text (2E 74 65 78 74 00 00 00) +CHECK-NEXT: VirtualSize: 0x12 +CHECK-NEXT: VirtualAddress: 0x1000 +CHECK-NEXT: RawDataSize: 0 +CHECK: Section { +CHECK-NEXT: Number: 2 +CHECK-NEXT: Name: .rdata (2E 72 64 61 74 61 00 00) +CHECK-NEXT: VirtualSize: 0x6D +CHECK-NEXT: VirtualAddress: 0x2000 +CHECK-NEXT: RawDataSize: 512 +CHECK: Section { +CHECK-NEXT: Number: 3 +CHECK-NEXT: Name: .debug_abbrev (2F 34 00 00 00 00 00 00) +CHECK-NEXT: VirtualSize: 0x4E +CHECK-NEXT: VirtualAddress: 0x3000 +CHECK-NEXT: RawDataSize: 512 +CHECK: Section { +CHECK-NEXT: Number: 4 +CHECK-NEXT: Name: .debug_info (2F 32 39 00 00 00 00 00) +CHECK-NEXT: VirtualSize: 0x74 +CHECK-NEXT: VirtualAddress: 0x4000 +CHECK-NEXT: RawDataSize: 512 +CHECK: Section { +CHECK-NEXT: Number: 5 +CHECK-NEXT: Name: .debug_line (2F 34 31 00 00 00 00 00) +CHECK-NEXT: VirtualSize: 0x3C +CHECK-NEXT: VirtualAddress: 0x5000 +CHECK-NEXT: RawDataSize: 512 +CHECK: Section { +CHECK-NEXT: Number: 6 +CHECK-NEXT: Name: .debug_str (2F 31 38 00 00 00 00 00) +CHECK-NEXT: VirtualSize: 0xD9 +CHECK-NEXT: VirtualAddress: 0x6000 +CHECK-NEXT: RawDataSize: 512 From 7a761100960c0c9e2b2fa8a9ee233b137270bd73 Mon Sep 17 00:00:00 2001 From: Zhengxing li Date: Sat, 4 Jan 2025 14:02:39 -0800 Subject: [PATCH 168/480] [HLSL][SPIR-V] implement SV_GroupID semantic lowering (#121521) The HLSL SV_GroupID semantic attribute is lowered into @llvm.spv.group.id intrinsic in LLVM IR for SPIR-V target. In the SPIR-V backend, this is now translated to a `WorkgroupId` builtin variable. 
Fixes #118700, which is follow-up work to #70120 --- clang/lib/CodeGen/CGHLSLRuntime.cpp | 2 +- clang/lib/CodeGen/CGHLSLRuntime.h | 1 + .../CodeGenHLSL/semantics/SV_GroupID.hlsl | 34 ++++++------ llvm/include/llvm/IR/IntrinsicsSPIRV.td | 1 + .../Target/SPIRV/SPIRVInstructionSelector.cpp | 8 +++ .../SPIRV/hlsl-intrinsics/SV_GroupID.ll | 52 +++++++++++++++++++ 6 files changed, 82 insertions(+), 16 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_GroupID.ll diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index c354e58e15f4b..5679bd7158179 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -395,7 +395,7 @@ llvm::Value *CGHLSLRuntime::emitInputSemantic(IRBuilder<> &B, return buildVectorInput(B, GroupThreadIDIntrinsic, Ty); } if (D.hasAttr()) { - llvm::Function *GroupIDIntrinsic = CGM.getIntrinsic(Intrinsic::dx_group_id); + llvm::Function *GroupIDIntrinsic = CGM.getIntrinsic(getGroupIdIntrinsic()); return buildVectorInput(B, GroupIDIntrinsic, Ty); } assert(false && "Unhandled parameter attribute"); diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index edb87f9d5efdf..3d5724118611c 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -87,6 +87,7 @@ class CGHLSLRuntime { GENERATE_HLSL_INTRINSIC_FUNCTION(Radians, radians) GENERATE_HLSL_INTRINSIC_FUNCTION(ThreadId, thread_id) GENERATE_HLSL_INTRINSIC_FUNCTION(GroupThreadId, thread_id_in_group) + GENERATE_HLSL_INTRINSIC_FUNCTION(GroupId, group_id) GENERATE_HLSL_INTRINSIC_FUNCTION(FDot, fdot) GENERATE_HLSL_INTRINSIC_FUNCTION(SDot, sdot) GENERATE_HLSL_INTRINSIC_FUNCTION(UDot, udot) diff --git a/clang/test/CodeGenHLSL/semantics/SV_GroupID.hlsl b/clang/test/CodeGenHLSL/semantics/SV_GroupID.hlsl index 5e09f0fe06d4e..3aa054afc9045 100644 --- a/clang/test/CodeGenHLSL/semantics/SV_GroupID.hlsl +++ b/clang/test/CodeGenHLSL/semantics/SV_GroupID.hlsl @@ -1,32
+1,36 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv -// Make sure SV_GroupID translated into dx.group.id. +// Make sure SV_GroupID translated into dx.group.id for directx target and spv.group.id for spirv target. -// CHECK: define void @foo() -// CHECK: %[[#ID:]] = call i32 @llvm.dx.group.id(i32 0) -// CHECK: call void @{{.*}}foo{{.*}}(i32 %[[#ID]]) +// CHECK: define void @foo() +// CHECK: %[[#ID:]] = call i32 @llvm.[[TARGET]].group.id(i32 0) +// CHECK-DXIL: call void @{{.*}}foo{{.*}}(i32 %[[#ID]]) +// CHECK-SPIRV: call spir_func void @{{.*}}foo{{.*}}(i32 %[[#ID]]) [shader("compute")] [numthreads(8,8,1)] void foo(uint Idx : SV_GroupID) {} -// CHECK: define void @bar() -// CHECK: %[[#ID_X:]] = call i32 @llvm.dx.group.id(i32 0) -// CHECK: %[[#ID_X_:]] = insertelement <2 x i32> poison, i32 %[[#ID_X]], i64 0 -// CHECK: %[[#ID_Y:]] = call i32 @llvm.dx.group.id(i32 1) -// CHECK: %[[#ID_XY:]] = insertelement <2 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1 -// CHECK: call void @{{.*}}bar{{.*}}(<2 x i32> %[[#ID_XY]]) +// CHECK: define void @bar() +// CHECK: %[[#ID_X:]] = call i32 @llvm.[[TARGET]].group.id(i32 0) +// CHECK: %[[#ID_X_:]] = insertelement <2 x i32> poison, i32 %[[#ID_X]], i64 0 +// CHECK: %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].group.id(i32 1) +// CHECK: %[[#ID_XY:]] = insertelement <2 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1 +// CHECK-DXIL: call void @{{.*}}bar{{.*}}(<2 x i32> %[[#ID_XY]]) +// CHECK-SPIRV: call spir_func void @{{.*}}bar{{.*}}(<2 x i32> %[[#ID_XY]]) 
[shader("compute")] [numthreads(8,8,1)] void bar(uint2 Idx : SV_GroupID) {} // CHECK: define void @test() -// CHECK: %[[#ID_X:]] = call i32 @llvm.dx.group.id(i32 0) +// CHECK: %[[#ID_X:]] = call i32 @llvm.[[TARGET]].group.id(i32 0) // CHECK: %[[#ID_X_:]] = insertelement <3 x i32> poison, i32 %[[#ID_X]], i64 0 -// CHECK: %[[#ID_Y:]] = call i32 @llvm.dx.group.id(i32 1) +// CHECK: %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].group.id(i32 1) // CHECK: %[[#ID_XY:]] = insertelement <3 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1 -// CHECK: %[[#ID_Z:]] = call i32 @llvm.dx.group.id(i32 2) +// CHECK: %[[#ID_Z:]] = call i32 @llvm.[[TARGET]].group.id(i32 2) // CHECK: %[[#ID_XYZ:]] = insertelement <3 x i32> %[[#ID_XY]], i32 %[[#ID_Z]], i64 2 -// CHECK: call void @{{.*}}test{{.*}}(<3 x i32> %[[#ID_XYZ]]) +// CHECK-DXIL: call void @{{.*}}test{{.*}}(<3 x i32> %[[#ID_XYZ]]) +// CHECK-SPIRV: call spir_func void @{{.*}}test{{.*}}(<3 x i32> %[[#ID_XYZ]]) [shader("compute")] [numthreads(8,8,1)] void test(uint3 Idx : SV_GroupID) {} diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index bcff0f20b985d..8ebce408ff138 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -59,6 +59,7 @@ let TargetPrefix = "spv" in { // The following intrinsic(s) are mirrored from IntrinsicsDirectX.td for HLSL support. 
def int_spv_thread_id : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrWillReturn]>; + def int_spv_group_id : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrWillReturn]>; def int_spv_thread_id_in_group : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrWillReturn]>; def int_spv_all : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem]>; def int_spv_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem]>; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 289d5f3166487..0fa0986a10c69 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -2881,6 +2881,14 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, // translated to a `LocalInvocationId` builtin variable return loadVec3BuiltinInputID(SPIRV::BuiltIn::LocalInvocationId, ResVReg, ResType, I); + case Intrinsic::spv_group_id: + // The HLSL SV_GroupId semantic is lowered to + // llvm.spv.group.id intrinsic in LLVM IR for SPIR-V backend. 
+ // + // In SPIR-V backend, llvm.spv.group.id is now translated to a `WorkgroupId` + // builtin variable + return loadVec3BuiltinInputID(SPIRV::BuiltIn::WorkgroupId, ResVReg, ResType, + I); case Intrinsic::spv_fdot: return selectFloatDot(ResVReg, ResType, I); case Intrinsic::spv_udot: diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_GroupID.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_GroupID.ll new file mode 100644 index 0000000000000..92947f7865ced --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_GroupID.ll @@ -0,0 +1,52 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#int:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#v3int:]] = OpTypeVector %[[#int]] 3 +; CHECK-DAG: %[[#ptr_Input_v3int:]] = OpTypePointer Input %[[#v3int]] +; CHECK-DAG: %[[#tempvar:]] = OpUndef %[[#v3int]] +; CHECK-DAG: %[[#WorkgroupId:]] = OpVariable %[[#ptr_Input_v3int]] Input + +; CHECK-DAG: OpEntryPoint GLCompute {{.*}} %[[#WorkgroupId]] +; CHECK-DAG: OpName %[[#WorkgroupId]] "__spirv_BuiltInWorkgroupId" +; CHECK-DAG: OpDecorate %[[#WorkgroupId]] LinkageAttributes "__spirv_BuiltInWorkgroupId" Import +; CHECK-DAG: OpDecorate %[[#WorkgroupId]] BuiltIn WorkgroupId + +target triple = "spirv-unknown-vulkan-library" + +declare void @group_id_user(<3 x i32>) + +; Function Attrs: convergent noinline norecurse +define void @main() #1 { +entry: + +; CHECK: %[[#load:]] = OpLoad %[[#v3int]] %[[#WorkgroupId]] +; CHECK: %[[#load0:]] = OpCompositeExtract %[[#int]] %[[#load]] 0 + %1 = call i32 @llvm.spv.group.id(i32 0) + +; CHECK: %[[#tempvar:]] = OpCompositeInsert %[[#v3int]] %[[#load0]] %[[#tempvar]] + %2 = insertelement <3 x i32> poison, i32 %1, i64 0 + +; CHECK: %[[#load:]] = OpLoad %[[#v3int]] %[[#WorkgroupId]] +; CHECK: %[[#load1:]] = OpCompositeExtract %[[#int]] %[[#load]] 1 + %3 = call i32 @llvm.spv.group.id(i32 1) + +; 
CHECK: %[[#tempvar:]] = OpCompositeInsert %[[#v3int]] %[[#load1]] %[[#tempvar]] 1 + %4 = insertelement <3 x i32> %2, i32 %3, i64 1 + +; CHECK: %[[#load:]] = OpLoad %[[#v3int]] %[[#WorkgroupId]] +; CHECK: %[[#load2:]] = OpCompositeExtract %[[#int]] %[[#load]] 2 + %5 = call i32 @llvm.spv.group.id(i32 2) + +; CHECK: %[[#tempvar:]] = OpCompositeInsert %[[#v3int]] %[[#load2]] %[[#tempvar]] 2 + %6 = insertelement <3 x i32> %4, i32 %5, i64 2 + + call spir_func void @group_id_user(<3 x i32> %6) + ret void +} + +; Function Attrs: nounwind willreturn memory(none) +declare i32 @llvm.spv.group.id(i32) #3 + +attributes #1 = { convergent noinline norecurse "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #3 = { nounwind willreturn memory(none) } From a738d81cd2822698539b0482af48d49d91ea5a2e Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Sat, 4 Jan 2025 23:10:41 +0100 Subject: [PATCH 169/480] [libc] Improve qsort (with build fix) (#121482) --- libc/fuzzing/stdlib/CMakeLists.txt | 6 +- libc/fuzzing/stdlib/heap_sort_fuzz.cpp | 29 ++- .../{qsort_fuzz.cpp => quick_sort_fuzz.cpp} | 29 ++- libc/src/stdlib/heap_sort.h | 12 +- libc/src/stdlib/qsort.cpp | 10 +- libc/src/stdlib/qsort_data.h | 171 +++++++++------ libc/src/stdlib/qsort_pivot.h | 85 ++++++++ libc/src/stdlib/qsort_r.cpp | 11 +- libc/src/stdlib/qsort_util.h | 47 +++- libc/src/stdlib/quick_sort.h | 203 +++++++++++++----- libc/test/src/stdlib/CMakeLists.txt | 18 +- libc/test/src/stdlib/SortingTest.h | 199 +++++++++-------- libc/test/src/stdlib/heap_sort_test.cpp | 18 +- libc/test/src/stdlib/qsort_r_test.cpp | 4 +- libc/test/src/stdlib/qsort_test.cpp | 17 -- libc/test/src/stdlib/quick_sort_test.cpp | 19 +- .../libc/test/src/stdlib/BUILD.bazel | 16 +- 17 files changed, 569 insertions(+), 325 deletions(-) rename libc/fuzzing/stdlib/{qsort_fuzz.cpp => quick_sort_fuzz.cpp} (62%) create mode 100644 libc/src/stdlib/qsort_pivot.h delete mode 100644 
libc/test/src/stdlib/qsort_test.cpp diff --git a/libc/fuzzing/stdlib/CMakeLists.txt b/libc/fuzzing/stdlib/CMakeLists.txt index 9b3298cfc55a7..3dbd640a67dbd 100644 --- a/libc/fuzzing/stdlib/CMakeLists.txt +++ b/libc/fuzzing/stdlib/CMakeLists.txt @@ -1,9 +1,9 @@ add_libc_fuzzer( - qsort_fuzz + quick_sort_fuzz SRCS - qsort_fuzz.cpp + quick_sort_fuzz.cpp DEPENDS - libc.src.stdlib.qsort + libc.src.stdlib.qsort_util ) add_libc_fuzzer( diff --git a/libc/fuzzing/stdlib/heap_sort_fuzz.cpp b/libc/fuzzing/stdlib/heap_sort_fuzz.cpp index 876c5f9975d4d..6b00306ec7dc1 100644 --- a/libc/fuzzing/stdlib/heap_sort_fuzz.cpp +++ b/libc/fuzzing/stdlib/heap_sort_fuzz.cpp @@ -10,21 +10,10 @@ /// //===----------------------------------------------------------------------===// -#include "src/stdlib/heap_sort.h" +#include "src/stdlib/qsort_util.h" #include -static int int_compare(const void *l, const void *r) { - int li = *reinterpret_cast(l); - int ri = *reinterpret_cast(r); - if (li == ri) - return 0; - if (li > ri) - return 1; - return -1; -} - extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { - const size_t array_size = size / sizeof(int); if (array_size == 0) return 0; @@ -34,14 +23,22 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { for (size_t i = 0; i < array_size; ++i) array[i] = data_as_int[i]; - auto arr = LIBC_NAMESPACE::internal::Array( - reinterpret_cast(array), array_size, sizeof(int), int_compare); + const auto is_less = [](const void *a_ptr, + const void *b_ptr) noexcept -> bool { + const int &a = *static_cast(a_ptr); + const int &b = *static_cast(b_ptr); + + return a < b; + }; - LIBC_NAMESPACE::internal::heap_sort(arr); + constexpr bool USE_QUICKSORT = false; + LIBC_NAMESPACE::internal::unstable_sort_impl( + array, array_size, sizeof(int), is_less); - for (size_t i = 0; i < array_size - 1; ++i) + for (size_t i = 0; i < array_size - 1; ++i) { if (array[i] > array[i + 1]) __builtin_trap(); + } delete[] array; return 0; diff 
--git a/libc/fuzzing/stdlib/qsort_fuzz.cpp b/libc/fuzzing/stdlib/quick_sort_fuzz.cpp similarity index 62% rename from libc/fuzzing/stdlib/qsort_fuzz.cpp rename to libc/fuzzing/stdlib/quick_sort_fuzz.cpp index 5d5053cff5c58..6371e851d2fc3 100644 --- a/libc/fuzzing/stdlib/qsort_fuzz.cpp +++ b/libc/fuzzing/stdlib/quick_sort_fuzz.cpp @@ -1,4 +1,4 @@ -//===-- qsort_fuzz.cpp ----------------------------------------------------===// +//===-- quick_sort_fuzz.cpp------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,24 +6,13 @@ // //===----------------------------------------------------------------------===// /// -/// Fuzzing test for llvm-libc qsort implementation. +/// Fuzzing test for llvm-libc quick_sort implementation. /// //===----------------------------------------------------------------------===// -#include "src/stdlib/qsort.h" +#include "src/stdlib/qsort_util.h" #include -static int int_compare(const void *l, const void *r) { - int li = *reinterpret_cast(l); - int ri = *reinterpret_cast(r); - if (li == ri) - return 0; - else if (li > ri) - return 1; - else - return -1; -} - extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { const size_t array_size = size / sizeof(int); if (array_size == 0) @@ -34,7 +23,17 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { for (size_t i = 0; i < array_size; ++i) array[i] = data_as_int[i]; - LIBC_NAMESPACE::qsort(array, array_size, sizeof(int), int_compare); + const auto is_less = [](const void *a_ptr, + const void *b_ptr) noexcept -> bool { + const int &a = *static_cast(a_ptr); + const int &b = *static_cast(b_ptr); + + return a < b; + }; + + constexpr bool USE_QUICKSORT = true; + LIBC_NAMESPACE::internal::unstable_sort_impl( + array, array_size, sizeof(int), is_less); for (size_t i = 0; i < array_size - 1; ++i) { if (array[i] > 
array[i + 1]) diff --git a/libc/src/stdlib/heap_sort.h b/libc/src/stdlib/heap_sort.h index ccb9ec5f82149..b9699776df89c 100644 --- a/libc/src/stdlib/heap_sort.h +++ b/libc/src/stdlib/heap_sort.h @@ -18,11 +18,12 @@ namespace internal { // A simple in-place heapsort implementation. // Follow the implementation in https://en.wikipedia.org/wiki/Heapsort. -LIBC_INLINE void heap_sort(const Array &array) { - size_t end = array.size(); +template +LIBC_INLINE void heap_sort(const A &array, const F &is_less) { + size_t end = array.len(); size_t start = end / 2; - auto left_child = [](size_t i) -> size_t { return 2 * i + 1; }; + const auto left_child = [](size_t i) -> size_t { return 2 * i + 1; }; while (end > 1) { if (start > 0) { @@ -40,12 +41,11 @@ LIBC_INLINE void heap_sort(const Array &array) { while (left_child(root) < end) { size_t child = left_child(root); // If there are two children, set child to the greater. - if (child + 1 < end && - array.elem_compare(child, array.get(child + 1)) < 0) + if ((child + 1 < end) && is_less(array.get(child), array.get(child + 1))) ++child; // If the root is less than the greater child - if (array.elem_compare(root, array.get(child)) >= 0) + if (!is_less(array.get(root), array.get(child))) break; // Swap the root with the greater child and continue sifting down. 
diff --git a/libc/src/stdlib/qsort.cpp b/libc/src/stdlib/qsort.cpp index 65a63c239f5c0..0bf5fc7980527 100644 --- a/libc/src/stdlib/qsort.cpp +++ b/libc/src/stdlib/qsort.cpp @@ -18,14 +18,12 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(void, qsort, (void *array, size_t array_size, size_t elem_size, int (*compare)(const void *, const void *))) { - if (array == nullptr || array_size == 0 || elem_size == 0) - return; - internal::Comparator c(compare); - auto arr = internal::Array(reinterpret_cast(array), array_size, - elem_size, c); + const auto is_less = [compare](const void *a, const void *b) -> bool { + return compare(a, b) < 0; + }; - internal::sort(arr); + internal::unstable_sort(array, array_size, elem_size, is_less); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h index c529d55ca46ff..aa6d9bbc123de 100644 --- a/libc/src/stdlib/qsort_data.h +++ b/libc/src/stdlib/qsort_data.h @@ -17,91 +17,122 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { -using Compare = int(const void *, const void *); -using CompareWithState = int(const void *, const void *, void *); - -enum class CompType { COMPARE, COMPARE_WITH_STATE }; - -struct Comparator { - union { - Compare *comp_func; - CompareWithState *comp_func_r; - }; - const CompType comp_type; - - void *arg; - - Comparator(Compare *func) - : comp_func(func), comp_type(CompType::COMPARE), arg(nullptr) {} - - Comparator(CompareWithState *func, void *arg_val) - : comp_func_r(func), comp_type(CompType::COMPARE_WITH_STATE), - arg(arg_val) {} - -#if defined(__clang__) - // Recent upstream changes to -fsanitize=function find more instances of - // function type mismatches. One case is with the comparator passed to this - // class. Libraries will tend to pass comparators that take pointers to - // varying types while this comparator expects to accept const void pointers. 
- // Ideally those tools would pass a function that strictly accepts const - // void*s to avoid UB, or would use qsort_r to pass their own comparator. - [[clang::no_sanitize("function")]] -#endif - int comp_vals(const void *a, const void *b) const { - if (comp_type == CompType::COMPARE) { - return comp_func(a, b); - } else { - return comp_func_r(a, b, arg); +class ArrayGenericSize { + cpp::byte *array_base; + size_t array_len; + size_t elem_size; + + LIBC_INLINE cpp::byte *get_internal(size_t i) const { + return array_base + (i * elem_size); + } + +public: + LIBC_INLINE ArrayGenericSize(void *a, size_t s, size_t e) + : array_base(reinterpret_cast(a)), array_len(s), + elem_size(e) {} + + static constexpr bool has_fixed_size() { return false; } + + LIBC_INLINE void *get(size_t i) const { return get_internal(i); } + + LIBC_INLINE void swap(size_t i, size_t j) const { + // It's possible to use 8 byte blocks with `uint64_t`, but that + // generates more machine code as the remainder loop gets + // unrolled, plus 4 byte operations are more likely to be + // efficient on a wider variety of hardware. On x86 LLVM tends + // to unroll the block loop again into 2 16 byte swaps per + // iteration which is another reason that 4 byte blocks yields + // good performance even for big types. 
+ using block_t = uint32_t; + constexpr size_t BLOCK_SIZE = sizeof(block_t); + + alignas(block_t) cpp::byte tmp_block[BLOCK_SIZE]; + + cpp::byte *elem_i = get_internal(i); + cpp::byte *elem_j = get_internal(j); + + const size_t elem_size_rem = elem_size % BLOCK_SIZE; + const cpp::byte *elem_i_block_end = elem_i + (elem_size - elem_size_rem); + + while (elem_i != elem_i_block_end) { + __builtin_memcpy(tmp_block, elem_i, BLOCK_SIZE); + __builtin_memcpy(elem_i, elem_j, BLOCK_SIZE); + __builtin_memcpy(elem_j, tmp_block, BLOCK_SIZE); + + elem_i += BLOCK_SIZE; + elem_j += BLOCK_SIZE; + } + + for (size_t n = 0; n < elem_size_rem; ++n) { + cpp::byte tmp = elem_i[n]; + elem_i[n] = elem_j[n]; + elem_j[n] = tmp; } } + + LIBC_INLINE size_t len() const { return array_len; } + + // Make an Array starting at index |i| and length |s|. + LIBC_INLINE ArrayGenericSize make_array(size_t i, size_t s) const { + return ArrayGenericSize(get_internal(i), s, elem_size); + } + + // Reset this Array to point at a different interval of the same + // items starting at index |i|. + LIBC_INLINE void reset_bounds(size_t i, size_t s) { + array_base = get_internal(i); + array_len = s; + } }; -class Array { - uint8_t *array; - size_t array_size; - size_t elem_size; - Comparator compare; +// Having a specialized Array type for sorting that knows at +// compile-time what the size of the element is, allows for much more +// efficient swapping and for cheaper offset calculations. 
+template class ArrayFixedSize { + cpp::byte *array_base; + size_t array_len; -public: - Array(uint8_t *a, size_t s, size_t e, Comparator c) - : array(a), array_size(s), elem_size(e), compare(c) {} - - uint8_t *get(size_t i) const { return array + i * elem_size; } - - void swap(size_t i, size_t j) const { - uint8_t *elem_i = get(i); - uint8_t *elem_j = get(j); - for (size_t b = 0; b < elem_size; ++b) { - uint8_t temp = elem_i[b]; - elem_i[b] = elem_j[b]; - elem_j[b] = temp; - } + LIBC_INLINE cpp::byte *get_internal(size_t i) const { + return array_base + (i * ELEM_SIZE); } - int elem_compare(size_t i, const uint8_t *other) const { - // An element must compare equal to itself so we don't need to consult the - // user provided comparator. - if (get(i) == other) - return 0; - return compare.comp_vals(get(i), other); +public: + LIBC_INLINE ArrayFixedSize(void *a, size_t s) + : array_base(reinterpret_cast(a)), array_len(s) {} + + // Beware this function is used as a heuristic for cheap to swap types, so + // instantiating `ArrayFixedSize` with `ELEM_SIZE > 100` is probably a bad + // idea perf wise. + static constexpr bool has_fixed_size() { return true; } + + LIBC_INLINE void *get(size_t i) const { return get_internal(i); } + + LIBC_INLINE void swap(size_t i, size_t j) const { + alignas(32) cpp::byte tmp[ELEM_SIZE]; + + cpp::byte *elem_i = get_internal(i); + cpp::byte *elem_j = get_internal(j); + + __builtin_memcpy(tmp, elem_i, ELEM_SIZE); + __builtin_memmove(elem_i, elem_j, ELEM_SIZE); + __builtin_memcpy(elem_j, tmp, ELEM_SIZE); } - size_t size() const { return array_size; } + LIBC_INLINE size_t len() const { return array_len; } - // Make an Array starting at index |i| and size |s|. - LIBC_INLINE Array make_array(size_t i, size_t s) const { - return Array(get(i), s, elem_size, compare); + // Make an Array starting at index |i| and length |s|.
+ LIBC_INLINE ArrayFixedSize make_array(size_t i, size_t s) const { + return ArrayFixedSize(get_internal(i), s); } - // Reset this Array to point at a different interval of the same items. - LIBC_INLINE void reset_bounds(uint8_t *a, size_t s) { - array = a; - array_size = s; + // Reset this Array to point at a different interval of the same + // items starting at index |i|. + LIBC_INLINE void reset_bounds(size_t i, size_t s) { + array_base = get_internal(i); + array_len = s; } }; -using SortingRoutine = void(const Array &); - } // namespace internal } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/qsort_pivot.h b/libc/src/stdlib/qsort_pivot.h new file mode 100644 index 0000000000000..b7e1b4294f6d6 --- /dev/null +++ b/libc/src/stdlib/qsort_pivot.h @@ -0,0 +1,85 @@ +//===-- Implementation header for qsort utilities ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H +#define LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H + +#include + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +// Recursively select a pseudomedian if above this threshold. +constexpr size_t PSEUDO_MEDIAN_REC_THRESHOLD = 64; + +// Selects a pivot from `array`. Algorithm taken from glidesort by Orson Peters. +// +// This chooses a pivot by sampling an adaptive amount of points, approximating +// the quality of a median of sqrt(n) elements. 
+template +size_t choose_pivot(const A &array, const F &is_less) { + const size_t len = array.len(); + + if (len < 8) { + return 0; + } + + const size_t len_div_8 = len / 8; + + const size_t a = 0; // [0, floor(n/8)) + const size_t b = len_div_8 * 4; // [4*floor(n/8), 5*floor(n/8)) + const size_t c = len_div_8 * 7; // [7*floor(n/8), 8*floor(n/8)) + + if (len < PSEUDO_MEDIAN_REC_THRESHOLD) + return median3(array, a, b, c, is_less); + else + return median3_rec(array, a, b, c, len_div_8, is_less); +} + +// Calculates an approximate median of 3 elements from sections a, b, c, or +// recursively from an approximation of each, if they're large enough. By +// dividing the size of each section by 8 when recursing we have logarithmic +// recursion depth and overall sample from f(n) = 3*f(n/8) -> f(n) = +// O(n^(log(3)/log(8))) ~= O(n^0.528) elements. +template +size_t median3_rec(const A &array, size_t a, size_t b, size_t c, size_t n, + const F &is_less) { + if (n * 8 >= PSEUDO_MEDIAN_REC_THRESHOLD) { + const size_t n8 = n / 8; + a = median3_rec(array, a, a + (n8 * 4), a + (n8 * 7), n8, is_less); + b = median3_rec(array, b, b + (n8 * 4), b + (n8 * 7), n8, is_less); + c = median3_rec(array, c, c + (n8 * 4), c + (n8 * 7), n8, is_less); + } + return median3(array, a, b, c, is_less); +} + +/// Calculates the median of 3 elements. +template +size_t median3(const A &array, size_t a, size_t b, size_t c, const F &is_less) { + const void *a_ptr = array.get(a); + const void *b_ptr = array.get(b); + const void *c_ptr = array.get(c); + + const bool x = is_less(a_ptr, b_ptr); + const bool y = is_less(a_ptr, c_ptr); + if (x == y) { + // If x=y=0 then b, c <= a. In this case we want to return max(b, c). + // If x=y=1 then a < b, c. In this case we want to return min(b, c). + // By toggling the outcome of b < c using XOR x we get this behavior. + const bool z = is_less(b_ptr, c_ptr); + return z ^ x ? c : b; + } else { + // Either c <= a < b or b <= a < c, thus a is our median. 
+ return a; + } +} + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H diff --git a/libc/src/stdlib/qsort_r.cpp b/libc/src/stdlib/qsort_r.cpp index bf61a40e84734..4e60998b6a6df 100644 --- a/libc/src/stdlib/qsort_r.cpp +++ b/libc/src/stdlib/qsort_r.cpp @@ -19,13 +19,12 @@ LLVM_LIBC_FUNCTION(void, qsort_r, (void *array, size_t array_size, size_t elem_size, int (*compare)(const void *, const void *, void *), void *arg)) { - if (array == nullptr || array_size == 0 || elem_size == 0) - return; - internal::Comparator c(compare, arg); - auto arr = internal::Array(reinterpret_cast(array), array_size, - elem_size, c); - internal::sort(arr); + const auto is_less = [compare, arg](const void *a, const void *b) -> bool { + return compare(a, b, arg) < 0; + }; + + internal::unstable_sort(array, array_size, elem_size, is_less); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/qsort_util.h b/libc/src/stdlib/qsort_util.h index d42adde06d976..7882b829d3274 100644 --- a/libc/src/stdlib/qsort_util.h +++ b/libc/src/stdlib/qsort_util.h @@ -27,11 +27,48 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { -#if LIBC_QSORT_IMPL == LIBC_QSORT_QUICK_SORT -constexpr auto sort = quick_sort; -#elif LIBC_QSORT_IMPL == LIBC_QSORT_HEAP_SORT -constexpr auto sort = heap_sort; -#endif +template +LIBC_INLINE void unstable_sort_impl(void *array, size_t array_len, + size_t elem_size, const F &is_less) { + if (array == nullptr || array_len == 0 || elem_size == 0) + return; + + if constexpr (USE_QUICKSORT) { + switch (elem_size) { + case 4: { + auto arr_fixed_size = internal::ArrayFixedSize<4>(array, array_len); + quick_sort(arr_fixed_size, is_less); + return; + } + case 8: { + auto arr_fixed_size = internal::ArrayFixedSize<8>(array, array_len); + quick_sort(arr_fixed_size, is_less); + return; + } + case 16: { + auto arr_fixed_size = internal::ArrayFixedSize<16>(array, array_len); + quick_sort(arr_fixed_size, is_less); + return; 
+ } + default: + auto arr_generic_size = + internal::ArrayGenericSize(array, array_len, elem_size); + quick_sort(arr_generic_size, is_less); + return; + } + } else { + auto arr_generic_size = + internal::ArrayGenericSize(array, array_len, elem_size); + heap_sort(arr_generic_size, is_less); + } +} + +template +LIBC_INLINE void unstable_sort(void *array, size_t array_len, size_t elem_size, + const F &is_less) { +#define USE_QUICK_SORT ((LIBC_QSORT_IMPL) == (LIBC_QSORT_QUICK_SORT)) + unstable_sort_impl(array, array_len, elem_size, is_less); +} } // namespace internal } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h index 82b90a7d511d9..9ab2830250018 100644 --- a/libc/src/stdlib/quick_sort.h +++ b/libc/src/stdlib/quick_sort.h @@ -9,84 +9,175 @@ #ifndef LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H #define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H -#include "src/__support/macros/attributes.h" +#include "src/__support/CPP/bit.h" +#include "src/__support/CPP/cstddef.h" #include "src/__support/macros/config.h" -#include "src/stdlib/qsort_data.h" +#include "src/stdlib/qsort_pivot.h" #include namespace LIBC_NAMESPACE_DECL { namespace internal { -// A simple quicksort implementation using the Hoare partition scheme. -LIBC_INLINE size_t partition(const Array &array) { - const size_t array_size = array.size(); - size_t pivot_index = array_size / 2; - uint8_t *pivot = array.get(pivot_index); - size_t i = 0; - size_t j = array_size - 1; +// Branchless Lomuto partition based on the implementation by Lukas +// Bergdoll and Orson Peters +// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md. +// Simplified to avoid having to stack allocate. 
+template +LIBC_INLINE size_t partition_lomuto_branchless(const A &array, + const void *pivot, + const F &is_less) { + const size_t array_len = array.len(); + + size_t left = 0; + size_t right = 0; + + while (right < array_len) { + const bool right_is_lt = is_less(array.get(right), pivot); + array.swap(left, right); + left += static_cast(right_is_lt); + right += 1; + } + + return left; +} + +// Optimized for large types that are expensive to move. Not optimized +// for integers. It's possible to use a cyclic permutation here for +// large types as done in ipnsort but the advantages of this are limited +// as `is_less` is a small wrapper around a call to a function pointer +// and won't incur much binary-size overhead. The other reason to use +// cyclic permutation is to have more efficient swapping, but we don't +// know the element size so this isn't applicable here either. +template +LIBC_INLINE size_t partition_hoare_branchy(const A &array, const void *pivot, + const F &is_less) { + const size_t array_len = array.len(); + + size_t left = 0; + size_t right = array_len; while (true) { - int compare_i, compare_j; - - while ((compare_i = array.elem_compare(i, pivot)) < 0) - ++i; - while ((compare_j = array.elem_compare(j, pivot)) > 0) - --j; - - // At some point i will crossover j so we will definitely break out of - // this while loop. - if (i >= j) - return j + 1; - - array.swap(i, j); - - // The pivot itself might have got swapped so we will update the pivot. - if (i == pivot_index) { - pivot = array.get(j); - pivot_index = j; - } else if (j == pivot_index) { - pivot = array.get(i); - pivot_index = i; + while (left < right && is_less(array.get(left), pivot)) + ++left; + + while (true) { + --right; + if (left >= right || is_less(array.get(right), pivot)) { + break; + } } - if (compare_i == 0 && compare_j == 0) { - // If we do not move the pointers, we will end up with an - // infinite loop as i and j will be stuck without advancing. 
- ++i; - --j; - } + if (left >= right) + break; + + array.swap(left, right); + ++left; + } + + return left; +} + +template +LIBC_INLINE size_t partition(const A &array, size_t pivot_index, + const F &is_less) { + // Place the pivot at the beginning of the array. + if (pivot_index != 0) { + array.swap(0, pivot_index); } + + const A array_without_pivot = array.make_array(1, array.len() - 1); + const void *pivot = array.get(0); + + size_t num_lt; + if constexpr (A::has_fixed_size()) { + // Branchless Lomuto avoid branch misprediction penalties, but + // it also swaps more often which is only faster if the swap is a fast + // constant operation. + num_lt = partition_lomuto_branchless(array_without_pivot, pivot, is_less); + } else { + num_lt = partition_hoare_branchy(array_without_pivot, pivot, is_less); + } + + // Place the pivot between the two partitions. + array.swap(0, num_lt); + + return num_lt; } -LIBC_INLINE void quick_sort(Array array) { +template +LIBC_INLINE void quick_sort_impl(A &array, const void *ancestor_pivot, + size_t limit, const F &is_less) { while (true) { - const size_t array_size = array.size(); - if (array_size <= 1) + const size_t array_len = array.len(); + if (array_len <= 1) return; - size_t split_index = partition(array); - if (array_size == 2) - // The partition operation sorts the two element array. + + // If too many bad pivot choices were made, simply fall back to + // heapsort in order to guarantee `O(N x log(N))` worst-case. + if (limit == 0) { + heap_sort(array, is_less); return; + } - // Make Arrays describing the two sublists that still need sorting. - Array left = array.make_array(0, split_index); - Array right = array.make_array(split_index, array.size() - split_index); - - // Recurse to sort the smaller of the two, and then loop round within this - // function to sort the larger. 
This way, recursive call depth is bounded - // by log2 of the total array size, because every recursive call is sorting - // a list at most half the length of the one in its caller. - if (left.size() < right.size()) { - quick_sort(left); - array.reset_bounds(right.get(0), right.size()); - } else { - quick_sort(right); - array.reset_bounds(left.get(0), left.size()); + limit -= 1; + + const size_t pivot_index = choose_pivot(array, is_less); + + // If the chosen pivot is equal to the predecessor, then it's the smallest + // element in the slice. Partition the slice into elements equal to and + // elements greater than the pivot. This case is usually hit when the slice + // contains many duplicate elements. + if (ancestor_pivot) { + if (!is_less(ancestor_pivot, array.get(pivot_index))) { + const size_t num_lt = + partition(array, pivot_index, + [is_less](const void *a, const void *b) -> bool { + return !is_less(b, a); + }); + + // Continue sorting elements greater than the pivot. We know that + // `num_lt` cont + array.reset_bounds(num_lt + 1, array.len() - (num_lt + 1)); + ancestor_pivot = nullptr; + continue; + } } + + size_t split_index = partition(array, pivot_index, is_less); + + if (array_len == 2) + // The partition operation sorts the two element array. + return; + + // Split the array into `left`, `pivot`, and `right`. + A left = array.make_array(0, split_index); + const void *pivot = array.get(split_index); + const size_t right_start = split_index + 1; + A right = array.make_array(right_start, array.len() - right_start); + + // Recurse into the left side. We have a fixed recursion limit, + // testing shows no real benefit for recursing into the shorter + // side. + quick_sort_impl(left, ancestor_pivot, limit, is_less); + + // Continue with the right side. 
+ array = right; + ancestor_pivot = pivot; } } +constexpr size_t ilog2(size_t n) { return cpp::bit_width(n) - 1; } + +template +LIBC_INLINE void quick_sort(A &array, const F &is_less) { + const void *ancestor_pivot = nullptr; + // Limit the number of imbalanced partitions to `2 * floor(log2(len))`. + // The binary OR by one is used to eliminate the zero-check in the logarithm. + const size_t limit = 2 * ilog2((array.len() | 1)); + quick_sort_impl(array, ancestor_pivot, limit, is_less); +} + } // namespace internal } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index 4ca2043ab4c9b..8cc0428632ba3 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -300,18 +300,6 @@ add_libc_test( libc.src.stdlib.bsearch ) -add_libc_test( - quick_sort_test - SUITE - libc-stdlib-tests - SRCS - quick_sort_test.cpp - HDRS - SortingTest.h - DEPENDS - libc.src.stdlib.qsort_util -) - add_libc_test( heap_sort_test SUITE @@ -321,15 +309,15 @@ add_libc_test( HDRS SortingTest.h DEPENDS - libc.src.stdlib.qsort_util + libc.src.stdlib.qsort ) add_libc_test( - qsort_test + quick_sort_test SUITE libc-stdlib-tests SRCS - qsort_test.cpp + quick_sort_test.cpp HDRS SortingTest.h DEPENDS diff --git a/libc/test/src/stdlib/SortingTest.h b/libc/test/src/stdlib/SortingTest.h index d34584e5addf0..034c0e4f1fd01 100644 --- a/libc/test/src/stdlib/SortingTest.h +++ b/libc/test/src/stdlib/SortingTest.h @@ -7,19 +7,19 @@ //===----------------------------------------------------------------------===// #include "src/__support/macros/config.h" -#include "src/stdlib/qsort_data.h" +#include "src/stdlib/qsort.h" #include "test/UnitTest/Test.h" class SortingTest : public LIBC_NAMESPACE::testing::Test { - using Array = LIBC_NAMESPACE::internal::Array; - using Comparator = LIBC_NAMESPACE::internal::Comparator; - using SortingRoutine = LIBC_NAMESPACE::internal::SortingRoutine; + using SortingRoutine = void 
(*)(void *array, size_t array_len, + size_t elem_size, + int (*compare)(const void *, const void *)); -public: static int int_compare(const void *l, const void *r) { int li = *reinterpret_cast(l); int ri = *reinterpret_cast(r); + if (li == ri) return 0; else if (li > ri) @@ -28,16 +28,19 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { return -1; } + static void int_sort(SortingRoutine sort_func, int *array, size_t array_len) { + sort_func(reinterpret_cast(array), array_len, sizeof(int), + int_compare); + } + +public: void test_sorted_array(SortingRoutine sort_func) { int array[25] = {10, 23, 33, 35, 55, 70, 71, 100, 110, 123, 133, 135, 155, 170, 171, 1100, 1110, 1123, 1133, 1135, 1155, 1170, 1171, 11100, 12310}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_LE(array[0], 10); ASSERT_LE(array[1], 23); @@ -69,14 +72,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_reversed_sorted_array(SortingRoutine sort_func) { int array[] = {25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + int_sort(sort_func, array, ARRAY_LEN); - sort_func(arr); - - for (int i = 0; i < int(ARRAY_SIZE - 1); ++i) + for (int i = 0; i < int(ARRAY_LEN - 1); ++i) ASSERT_EQ(array[i], i + 1); } @@ -84,14 +84,11 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { int array[] = {100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - 
auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); - for (size_t i = 0; i < ARRAY_SIZE; ++i) + for (size_t i = 0; i < ARRAY_LEN; ++i) ASSERT_EQ(array[i], 100); } @@ -99,12 +96,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { int array[25] = {10, 23, 8, 35, 55, 45, 40, 100, 110, 123, 90, 80, 70, 60, 171, 11, 1, -1, -5, -10, 1155, 1170, 1171, 12, -100}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], -100); ASSERT_EQ(array[1], -10); @@ -135,12 +129,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_unsorted_array_2(SortingRoutine sort_func) { int array[7] = {10, 40, 45, 55, 35, 23, 60}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); - - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 10); ASSERT_EQ(array[1], 23); @@ -153,12 +144,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_unsorted_array_duplicated_1(SortingRoutine sort_func) { int array[6] = {10, 10, 20, 20, 5, 5}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 5); ASSERT_EQ(array[1], 5); @@ -170,12 +158,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void 
test_unsorted_array_duplicated_2(SortingRoutine sort_func) { int array[10] = {20, 10, 10, 10, 10, 20, 21, 21, 21, 21}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 10); ASSERT_EQ(array[1], 10); @@ -191,12 +176,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_unsorted_array_duplicated_3(SortingRoutine sort_func) { int array[10] = {20, 30, 30, 30, 30, 20, 21, 21, 21, 21}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); - - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 20); ASSERT_EQ(array[1], 20); @@ -213,12 +195,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_unsorted_three_element_1(SortingRoutine sort_func) { int array[3] = {14999024, 0, 3}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 3); @@ -228,12 +207,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_unsorted_three_element_2(SortingRoutine sort_func) { int array[3] = {3, 14999024, 0}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 3); @@ -243,12 
+219,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_unsorted_three_element_3(SortingRoutine sort_func) { int array[3] = {3, 0, 14999024}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); - - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 3); @@ -258,12 +231,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_same_three_element(SortingRoutine sort_func) { int array[3] = {12345, 12345, 12345}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 12345); ASSERT_EQ(array[1], 12345); @@ -273,12 +243,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_unsorted_two_element_1(SortingRoutine sort_func) { int array[] = {14999024, 0}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 14999024); @@ -287,12 +254,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_unsorted_two_element_2(SortingRoutine sort_func) { int array[] = {0, 14999024}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); - - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 0); ASSERT_EQ(array[1], 
14999024); @@ -301,12 +265,9 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_same_two_element(SortingRoutine sort_func) { int array[] = {12345, 12345}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 12345); ASSERT_EQ(array[1], 12345); @@ -315,15 +276,76 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { void test_single_element(SortingRoutine sort_func) { int array[] = {12345}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, - sizeof(int), Comparator(int_compare)); + constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int); - sort_func(arr); + int_sort(sort_func, array, ARRAY_LEN); ASSERT_EQ(array[0], 12345); } + + void test_different_elem_size(SortingRoutine sort_func) { + // Random order of values [0,50) to avoid only testing pre-sorted handling. + // Long enough to reach interesting code. + constexpr uint8_t ARRAY_INITIAL_VALS[] = { + 42, 13, 8, 4, 17, 28, 20, 32, 22, 29, 7, 2, 46, 37, 26, 49, 24, + 38, 10, 18, 40, 36, 47, 15, 11, 48, 44, 33, 1, 5, 16, 35, 39, 41, + 14, 23, 3, 9, 6, 27, 21, 25, 31, 45, 12, 43, 34, 30, 19, 0}; + + constexpr size_t ARRAY_LEN = sizeof(ARRAY_INITIAL_VALS); + constexpr size_t MAX_ELEM_SIZE = 150; + constexpr size_t BUF_SIZE = ARRAY_LEN * MAX_ELEM_SIZE; + + static_assert(ARRAY_LEN < 256); // so we can encode the values. + + // Minimum alignment to test implementation for bugs related to assuming + // incorrect association between alignment and element size. 
+ alignas(1) uint8_t buf[BUF_SIZE]; + + const auto fill_buf = [&buf](size_t elem_size) { + for (size_t i = 0; i < BUF_SIZE; ++i) { + buf[i] = 0; + } + + for (size_t elem_i = 0, buf_i = 0; elem_i < ARRAY_LEN; ++elem_i) { + const uint8_t elem_val = ARRAY_INITIAL_VALS[elem_i]; + for (size_t elem_byte_i = 0; elem_byte_i < elem_size; ++elem_byte_i) { + buf[buf_i] = elem_val; + buf_i += 1; + } + } + }; + + for (size_t elem_size = 0; elem_size <= MAX_ELEM_SIZE; ++elem_size) { + // Fill all bytes with data to ensure mistakes in elem swap are noticed. + fill_buf(elem_size); + + sort_func(reinterpret_cast(buf), ARRAY_LEN, elem_size, + [](const void *a, const void *b) -> int { + const uint8_t a_val = *reinterpret_cast(a); + const uint8_t b_val = *reinterpret_cast(b); + + if (a_val < b_val) { + return -1; + } else if (a_val > b_val) { + return 1; + } else { + return 0; + } + }); + + for (size_t elem_i = 0, buf_i = 0; elem_i < ARRAY_LEN; ++elem_i) { + const uint8_t expected_elem_val = static_cast(elem_i); + + for (size_t elem_byte_i = 0; elem_byte_i < elem_size; ++elem_byte_i) { + const uint8_t buf_val = buf[buf_i]; + // Check that every byte in the element has the expected value. 
+ ASSERT_EQ(buf_val, expected_elem_val) + << "elem_size: " << elem_size << " buf_i: " << buf_i << '\n'; + buf_i += 1; + } + } + } + } }; #define LIST_SORTING_TESTS(Name, Func) \ @@ -374,4 +396,7 @@ class SortingTest : public LIBC_NAMESPACE::testing::Test { TEST_F(LlvmLibc##Name##Test, SingleElementArray) { \ test_single_element(Func); \ } \ + TEST_F(LlvmLibc##Name##Test, DifferentElemSizeArray) { \ + test_different_elem_size(Func); \ + } \ static_assert(true) diff --git a/libc/test/src/stdlib/heap_sort_test.cpp b/libc/test/src/stdlib/heap_sort_test.cpp index d70e3dc2272be..18d4244506ec2 100644 --- a/libc/test/src/stdlib/heap_sort_test.cpp +++ b/libc/test/src/stdlib/heap_sort_test.cpp @@ -7,10 +7,20 @@ //===----------------------------------------------------------------------===// #include "SortingTest.h" -#include "src/stdlib/heap_sort.h" +#include "src/stdlib/qsort_util.h" -void sort(const LIBC_NAMESPACE::internal::Array &array) { - LIBC_NAMESPACE::internal::heap_sort(array); +void heap_sort(void *array, size_t array_size, size_t elem_size, + int (*compare)(const void *, const void *)) { + + constexpr bool USE_QUICKSORT = false; + + const auto is_less = [compare](const void *a, + const void *b) noexcept -> bool { + return compare(a, b) < 0; + }; + + LIBC_NAMESPACE::internal::unstable_sort_impl( + array, array_size, elem_size, is_less); } -LIST_SORTING_TESTS(HeapSort, sort); +LIST_SORTING_TESTS(HeapSort, heap_sort); diff --git a/libc/test/src/stdlib/qsort_r_test.cpp b/libc/test/src/stdlib/qsort_r_test.cpp index 6893fdc7b74c8..f18923618ed5e 100644 --- a/libc/test/src/stdlib/qsort_r_test.cpp +++ b/libc/test/src/stdlib/qsort_r_test.cpp @@ -62,9 +62,9 @@ TEST(LlvmLibcQsortRTest, SortedArray) { ASSERT_LE(array[23], 11100); ASSERT_LE(array[24], 12310); - // This is a sorted list, but there still have to have been at least N + // This is a sorted list, but there still have to have been at least N - 1 // comparisons made. 
- ASSERT_GE(count, ARRAY_SIZE); + ASSERT_GE(count, ARRAY_SIZE - 1); } TEST(LlvmLibcQsortRTest, ReverseSortedArray) { diff --git a/libc/test/src/stdlib/qsort_test.cpp b/libc/test/src/stdlib/qsort_test.cpp deleted file mode 100644 index 1e921a86fd1fd..0000000000000 --- a/libc/test/src/stdlib/qsort_test.cpp +++ /dev/null @@ -1,17 +0,0 @@ -//===-- Unittests for qsort -----------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "SortingTest.h" -#include "src/stdlib/qsort.h" - -void sort(const LIBC_NAMESPACE::internal::Array &array) { - LIBC_NAMESPACE::qsort(reinterpret_cast(array.get(0)), array.size(), - sizeof(int), SortingTest::int_compare); -} - -LIST_SORTING_TESTS(Qsort, sort); diff --git a/libc/test/src/stdlib/quick_sort_test.cpp b/libc/test/src/stdlib/quick_sort_test.cpp index d6bf77ebfd40d..2832c855370bc 100644 --- a/libc/test/src/stdlib/quick_sort_test.cpp +++ b/libc/test/src/stdlib/quick_sort_test.cpp @@ -1,4 +1,4 @@ -//===-- Unittests for quick sort ------------------------------------------===// +//===-- Unittests for qsort -----------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -7,10 +7,19 @@ //===----------------------------------------------------------------------===// #include "SortingTest.h" -#include "src/stdlib/quick_sort.h" +#include "src/stdlib/qsort_util.h" -void sort(const LIBC_NAMESPACE::internal::Array &array) { - LIBC_NAMESPACE::internal::quick_sort(array); +void quick_sort(void *array, size_t array_size, size_t elem_size, + int (*compare)(const void *, const void *)) { + constexpr bool USE_QUICKSORT = true; + + const auto is_less = [compare](const void *a, + const void *b) noexcept -> bool { + return compare(a, b) < 0; + }; + + LIBC_NAMESPACE::internal::unstable_sort_impl( + array, array_size, elem_size, is_less); } -LIST_SORTING_TESTS(QuickSort, sort); +LIST_SORTING_TESTS(Qsort, quick_sort); diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel index e4b4b075705e8..c0f1546912662 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel @@ -120,31 +120,23 @@ libc_support_library( ], ) -libc_test( - name = "qsort_test", - srcs = ["qsort_test.cpp"], - libc_function_deps = ["//libc:qsort"], - deps = [ - ":qsort_test_helper", - "//libc:types_size_t", - ], -) - libc_test( name = "quick_sort_test", srcs = ["quick_sort_test.cpp"], + libc_function_deps = ["//libc:qsort"], deps = [ ":qsort_test_helper", - "//libc:qsort_util", + "//libc:types_size_t", ], ) libc_test( name = "heap_sort_test", srcs = ["heap_sort_test.cpp"], + libc_function_deps = ["//libc:qsort"], deps = [ ":qsort_test_helper", - "//libc:qsort_util", + "//libc:types_size_t", ], ) From 2bbdce9a42f58af4ca917eaba1bf1019ba658fd5 Mon Sep 17 00:00:00 2001 From: Evgenii Kudriashov Date: Sun, 5 Jan 2025 01:10:25 +0100 Subject: [PATCH 170/480] [GlobalISel] Support physical register inputs in nested patterns (#121239) When importing nested patterns, we create InsnMatcher for each pattern and 
miss them if consider only the top level InsnMatcher. Iterate PhysRegOperands instead. Change the type of PhysRegOperands from DenseMap to SmallMapVector to have stable generation. Also drop PhysRegInputs member from InsnMatcher as there are no users of it. --- .../GlobalISelEmitter/gisel-physreg-input.td | 84 ++++++++++++++++++- .../GlobalISel/GlobalISelMatchTable.cpp | 1 - .../Common/GlobalISel/GlobalISelMatchTable.h | 18 ++-- llvm/utils/TableGen/GlobalISelEmitter.cpp | 8 +- 4 files changed, 94 insertions(+), 17 deletions(-) diff --git a/llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td b/llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td index a05f364eb3f05..1f1b557ace608 100644 --- a/llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td +++ b/llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td @@ -22,6 +22,86 @@ class I Pat> let Pattern = Pat; } +// Try a nested physical register + +// GISEL: GIM_Try, +// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, +// GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_STORE), +// GISEL-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// GISEL-NEXT: // MIs[0] src0 +// GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32, +// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// GISEL-NEXT: // MIs[0] Operand 1 +// GISEL-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/32, +// GISEL-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] +// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3, +// GISEL-NEXT: GIM_CheckOpcode, /*MI*/1, GIMT_Encode2(TargetOpcode::G_MUL), +// GISEL-NEXT: // MIs[1] Operand 0 +// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32, +// GISEL-NEXT: // MIs[1] src1 +// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32, +// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, 
/*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// GISEL-NEXT: // MIs[1] Operand 2 +// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32, +// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/GIMT_Encode2(MyTarget::Special32RegClassID), +// GISEL-NEXT: GIM_CheckIsSafeToFold, /*NumInsns*/1, +// GISEL-NEXT: // (st GPR32:{ *:[i32] }:$src0, (mul:{ *:[i32] } GPR32:{ *:[i32] }:$src1, SPECIAL:{ *:[i32] })) => (MULM_PHYS GPR32:{ *:[i32] }:$src0, GPR32:{ *:[i32] }:$src1) +// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY), +// GISEL-NEXT: GIR_AddRegister, /*InsnID*/1, GIMT_Encode2(MyTarget::SPECIAL), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define), +// GISEL-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/1, /*OpIdx*/2, // SPECIAL +// GISEL-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::MULM_PHYS), +// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // src0 +// GISEL-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // src1 +// GISEL-NEXT: GIR_MergeMemOperands, /*InsnID*/0, /*NumInsns*/2, /*MergeInsnID's*/0, 1, +// GISEL-NEXT: GIR_RootConstrainSelectedInstOperands, +// GISEL-NEXT: // GIR_Coverage, 0, +// GISEL-NEXT: GIR_EraseRootFromParent_Done, +def MULM_PHYS : I<(outs), (ins GPR32:$src0, GPR32:$src1), + [(st GPR32:$src0, (mul GPR32:$src1, SPECIAL))]> { + let Uses = [SPECIAL]; +} + +// Try nested physical registers and check on duplicated copies + +// GISEL: GIM_Try, +// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, +// GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_STORE), +// GISEL-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// GISEL-NEXT: // MIs[0] src0 +// GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32, +// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// GISEL-NEXT: // MIs[0] Operand 1 +// GISEL-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, 
/*SizeInBits*/32, +// GISEL-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1] +// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3, +// GISEL-NEXT: GIM_CheckOpcode, /*MI*/1, GIMT_Encode2(TargetOpcode::G_MUL), +// GISEL-NEXT: // MIs[1] Operand 0 +// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32, +// GISEL-NEXT: // MIs[1] Operand 1 +// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32, +// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// GISEL-NEXT: // MIs[1] Operand 2 +// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32, +// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/GIMT_Encode2(MyTarget::Special32RegClassID), +// GISEL-NEXT: GIM_CheckIsSafeToFold, /*NumInsns*/1, +// GISEL-NEXT: // (st GPR32:{ *:[i32] }:$src0, (mul:{ *:[i32] } R0:{ *:[i32] }, SPECIAL:{ *:[i32] })) => (MULMR0_PHYS GPR32:{ *:[i32] }:$src0) +// GISEL-NEXT: GIR_BuildMI, /*InsnID*/2, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY), +// GISEL-NEXT: GIR_AddRegister, /*InsnID*/2, GIMT_Encode2(MyTarget::SPECIAL), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define), +// GISEL-NEXT: GIR_Copy, /*NewInsnID*/2, /*OldInsnID*/1, /*OpIdx*/2, // SPECIAL +// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY), +// GISEL-NEXT: GIR_AddRegister, /*InsnID*/1, GIMT_Encode2(MyTarget::R0), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define), +// GISEL-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/1, /*OpIdx*/1, // R0 +// GISEL-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::MULMR0_PHYS), +// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // src0 +// GISEL-NEXT: GIR_MergeMemOperands, /*InsnID*/0, /*NumInsns*/2, /*MergeInsnID's*/0, 1, +// GISEL-NEXT: GIR_RootConstrainSelectedInstOperands, +// GISEL-NEXT: // GIR_Coverage, 1, +// GISEL-NEXT: GIR_EraseRootFromParent_Done, +def MULMR0_PHYS : I<(outs), (ins GPR32:$src0), + [(st GPR32:$src0, (mul R0, 
SPECIAL))]> { + let Uses = [R0, SPECIAL]; +} + // Try a normal physical register use. // GISEL: GIM_Try, @@ -44,7 +124,7 @@ class I Pat> // GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst] // GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // src0 // GISEL-NEXT: GIR_RootConstrainSelectedInstOperands, -// GISEL-NEXT: // GIR_Coverage, 0, +// GISEL-NEXT: // GIR_Coverage, 2, // GISEL-NEXT: GIR_EraseRootFromParent_Done, def ADD_PHYS : I<(outs GPR32:$dst), (ins GPR32:$src0), [(set GPR32:$dst, (add GPR32:$src0, SPECIAL))]> { @@ -73,7 +153,7 @@ def ADD_PHYS : I<(outs GPR32:$dst), (ins GPR32:$src0), // GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst] // GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // SPECIAL // GISEL-NEXT: GIR_RootConstrainSelectedInstOperands, -// GISEL-NEXT: // GIR_Coverage, 1, +// GISEL-NEXT: // GIR_Coverage, 3, // GISEL-NEXT: GIR_EraseRootFromParent_Done, def MUL_PHYS : I<(outs GPR32:$dst), (ins GPR32:$SPECIAL), [(set GPR32:$dst, (mul GPR32:$SPECIAL, SPECIAL))]> { diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp index 619e7a4790c88..a81f2b53f2846 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp @@ -1723,7 +1723,6 @@ OperandMatcher &InstructionMatcher::addPhysRegInput(const Record *Reg, OperandMatcher *OM = new OperandMatcher(*this, OpIdx, "", TempOpIdx); Operands.emplace_back(OM); Rule.definePhysRegOperand(Reg, *OM); - PhysRegInputs.emplace_back(Reg, OpIdx); return *OM; } diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h index 48ce71be677c0..8e6de80d6083c 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h @@ -19,6 +19,7 @@ #include "Common/CodeGenDAGPatterns.h" #include 
"llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" @@ -492,9 +493,11 @@ class RuleMatcher : public Matcher { /// the renderers. StringMap DefinedOperands; + using PhysRegOperandsTy = SmallMapVector; + /// A map of anonymous physical register operands defined by the matchers that /// may be referenced by the renderers. - DenseMap PhysRegOperands; + PhysRegOperandsTy PhysRegOperands; /// ID for the next instruction variable defined with /// implicitlyDefineInsnVar() @@ -695,6 +698,10 @@ class RuleMatcher : public Matcher { unsigned allocateOutputInsnID() { return NextOutputInsnID++; } unsigned allocateTempRegID() { return NextTempRegID++; } + iterator_range physoperands() const { + return make_range(PhysRegOperands.begin(), PhysRegOperands.end()); + } + iterator_range insnmatchers() { return make_range(Matchers.begin(), Matchers.end()); } @@ -1756,11 +1763,6 @@ class InstructionMatcher final : public PredicateListMatcher { unsigned InsnVarID; bool AllowNumOpsCheck; - /// PhysRegInputs - List list has an entry for each explicitly specified - /// physreg input to the pattern. The first elt is the Register node, the - /// second is the recorded slot number the input pattern match saved it in. 
- SmallVector, 2> PhysRegInputs; - bool canAddNumOperandsCheck() const { // Add if it's allowed, and: // - We don't have a variadic operand @@ -1802,10 +1804,6 @@ class InstructionMatcher final : public PredicateListMatcher { OperandMatcher &addPhysRegInput(const Record *Reg, unsigned OpIdx, unsigned TempOpIdx); - ArrayRef> getPhysRegInputs() const { - return PhysRegInputs; - } - StringRef getSymbolicName() const { return SymbolicName; } unsigned getNumOperandMatchers() const { return Operands.size(); } diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 9f6d3a506dceb..3b334ea4ce152 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -1429,15 +1429,15 @@ Expected GlobalISelEmitter::createAndImportInstructionRenderer( action_iterator InsertPt = InsertPtOrError.get(); BuildMIAction &DstMIBuilder = *static_cast(InsertPt->get()); - for (auto PhysInput : InsnMatcher.getPhysRegInputs()) { + for (auto PhysOp : M.physoperands()) { InsertPt = M.insertAction( InsertPt, M.allocateOutputInsnID(), &Target.getInstruction(RK.getDef("COPY"))); BuildMIAction &CopyToPhysRegMIBuilder = *static_cast(InsertPt->get()); - CopyToPhysRegMIBuilder.addRenderer( - Target, PhysInput.first, true); - CopyToPhysRegMIBuilder.addRenderer(PhysInput.first); + CopyToPhysRegMIBuilder.addRenderer(Target, + PhysOp.first, true); + CopyToPhysRegMIBuilder.addRenderer(PhysOp.first); } if (auto Error = importExplicitDefRenderers(InsertPt, M, DstMIBuilder, Dst, From 66f16e682f84551552099a45e608fa260b14e3ab Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 4 Jan 2025 16:14:25 -0800 Subject: [PATCH 171/480] [clang-format][NFC] Add missing config tests for List of Strings (#121451) Also, simplify the existing test for NamespaceMacros. Like the options tested by the added tests, it's also a list of arbitrary strings and initialized to an empty list. 
(The other existing tests for list of strings either are initialized to a list of one or more strings or require specific strings.) --- clang/unittests/Format/ConfigParseTest.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 9c38dbbc51f0a..1f0beafaad7f7 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -144,6 +144,9 @@ TEST(ConfigParseTest, GetsCorrectBasedOnStyle) { EXPECT_EQ(0, parseConfiguration(TEXT, &Style).value()); \ EXPECT_EQ(VALUE, Style.FIELD) << "Unexpected value after parsing!" +#define CHECK_PARSE_LIST(FIELD) \ + CHECK_PARSE(#FIELD ": [foo]", FIELD, std::vector{"foo"}) + #define CHECK_PARSE_NESTED_VALUE(TEXT, STRUCT, FIELD, VALUE) \ EXPECT_NE(VALUE, Style.STRUCT.FIELD) << "Initial value already the same!"; \ EXPECT_EQ(0, parseConfiguration(#STRUCT ":\n " TEXT, &Style).value()); \ @@ -906,11 +909,15 @@ TEST(ConfigParseTest, ParsesConfiguration) { CHECK_PARSE("StatementMacros: [QUNUSED, QT_REQUIRE_VERSION]", StatementMacros, std::vector({"QUNUSED", "QT_REQUIRE_VERSION"})); - Style.NamespaceMacros.clear(); - CHECK_PARSE("NamespaceMacros: [TESTSUITE]", NamespaceMacros, - std::vector{"TESTSUITE"}); - CHECK_PARSE("NamespaceMacros: [TESTSUITE, SUITE]", NamespaceMacros, - std::vector({"TESTSUITE", "SUITE"})); + CHECK_PARSE_LIST(JavaImportGroups); + CHECK_PARSE_LIST(Macros); + CHECK_PARSE_LIST(NamespaceMacros); + CHECK_PARSE_LIST(ObjCPropertyAttributeOrder); + CHECK_PARSE_LIST(TableGenBreakingDAGArgOperators); + CHECK_PARSE_LIST(TemplateNames); + CHECK_PARSE_LIST(TypeNames); + CHECK_PARSE_LIST(TypenameMacros); + CHECK_PARSE_LIST(VariableTemplates); Style.WhitespaceSensitiveMacros.clear(); CHECK_PARSE("WhitespaceSensitiveMacros: [STRINGIZE]", From 04610b901f41c4abec169b9a38f1b0a2fde976c1 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 4 Jan 2025 16:19:46 -0800 Subject: 
[PATCH 172/480] [clang-format][NFC] Replace SmallVectorImpl with ArrayRef (#121621) --- clang/lib/Format/AffectedRangeManager.cpp | 10 ++-- clang/lib/Format/AffectedRangeManager.h | 4 +- clang/lib/Format/Format.cpp | 8 +-- clang/lib/Format/FormatTokenLexer.cpp | 5 +- clang/lib/Format/UnwrappedLineFormatter.cpp | 64 ++++++++++----------- clang/lib/Format/UnwrappedLineParser.cpp | 9 +-- clang/lib/Format/UnwrappedLineParser.h | 2 +- 7 files changed, 48 insertions(+), 54 deletions(-) diff --git a/clang/lib/Format/AffectedRangeManager.cpp b/clang/lib/Format/AffectedRangeManager.cpp index bf124d73e89e7..67108f3540191 100644 --- a/clang/lib/Format/AffectedRangeManager.cpp +++ b/clang/lib/Format/AffectedRangeManager.cpp @@ -21,8 +21,8 @@ namespace format { bool AffectedRangeManager::computeAffectedLines( SmallVectorImpl &Lines) { - SmallVectorImpl::iterator I = Lines.begin(); - SmallVectorImpl::iterator E = Lines.end(); + ArrayRef::iterator I = Lines.begin(); + ArrayRef::iterator E = Lines.end(); bool SomeLineAffected = false; const AnnotatedLine *PreviousLine = nullptr; while (I != E) { @@ -34,7 +34,7 @@ bool AffectedRangeManager::computeAffectedLines( // if any token within the directive is affected. 
if (Line->InPPDirective) { FormatToken *Last = Line->Last; - SmallVectorImpl::iterator PPEnd = I + 1; + const auto *PPEnd = I + 1; while (PPEnd != E && !(*PPEnd)->First->HasUnescapedNewline) { Last = (*PPEnd)->Last; ++PPEnd; @@ -89,8 +89,8 @@ bool AffectedRangeManager::affectsLeadingEmptyLines(const FormatToken &Tok) { } void AffectedRangeManager::markAllAsAffected( - SmallVectorImpl::iterator I, - SmallVectorImpl::iterator E) { + ArrayRef::iterator I, + ArrayRef::iterator E) { while (I != E) { (*I)->Affected = true; markAllAsAffected((*I)->Children.begin(), (*I)->Children.end()); diff --git a/clang/lib/Format/AffectedRangeManager.h b/clang/lib/Format/AffectedRangeManager.h index add16bdd7a7c3..eef056fdf0633 100644 --- a/clang/lib/Format/AffectedRangeManager.h +++ b/clang/lib/Format/AffectedRangeManager.h @@ -47,8 +47,8 @@ class AffectedRangeManager { bool affectsLeadingEmptyLines(const FormatToken &Tok); // Marks all lines between I and E as well as all their children as affected. - void markAllAsAffected(SmallVectorImpl::iterator I, - SmallVectorImpl::iterator E); + void markAllAsAffected(ArrayRef::iterator I, + ArrayRef::iterator E); // Determines whether 'Line' is affected by the SourceRanges given as input. // Returns \c true if line or one if its children is affected. diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index e51d7ac2e5b6c..fc60c5ec5eebc 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -3085,8 +3085,8 @@ static bool affectsRange(ArrayRef Ranges, unsigned Start, // its current line. // If `Cursor` is not on any #include, `Index` will be UINT_MAX. 
static std::pair -FindCursorIndex(const SmallVectorImpl &Includes, - const SmallVectorImpl &Indices, unsigned Cursor) { +FindCursorIndex(const ArrayRef &Includes, + const ArrayRef &Indices, unsigned Cursor) { unsigned CursorIndex = UINT_MAX; unsigned OffsetToEOL = 0; for (int i = 0, e = Includes.size(); i != e; ++i) { @@ -3135,7 +3135,7 @@ std::string replaceCRLF(const std::string &Code) { // provided and put on a deleted #include, it will be moved to the remaining // #include in the duplicate #includes. static void sortCppIncludes(const FormatStyle &Style, - const SmallVectorImpl &Includes, + const ArrayRef &Includes, ArrayRef Ranges, StringRef FileName, StringRef Code, tooling::Replacements &Replaces, unsigned *Cursor) { @@ -3378,7 +3378,7 @@ static unsigned findJavaImportGroup(const FormatStyle &Style, // import group, a newline is inserted, and within each import group, a // lexicographic sort based on ASCII value is performed. static void sortJavaImports(const FormatStyle &Style, - const SmallVectorImpl &Imports, + const ArrayRef &Imports, ArrayRef Ranges, StringRef FileName, StringRef Code, tooling::Replacements &Replaces) { unsigned ImportsBeginOffset = Imports.front().Offset; diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index 0f8d4940d4369..a1d7eeadec441 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -564,8 +564,7 @@ bool FormatTokenLexer::tryMergeTokens(ArrayRef Kinds, if (Tokens.size() < Kinds.size()) return false; - SmallVectorImpl::const_iterator First = - Tokens.end() - Kinds.size(); + const auto *First = Tokens.end() - Kinds.size(); for (unsigned i = 0; i < Kinds.size(); ++i) if (First[i]->isNot(Kinds[i])) return false; @@ -577,7 +576,7 @@ bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) { if (Tokens.size() < Count) return false; - SmallVectorImpl::const_iterator First = Tokens.end() - Count; + const auto *First = Tokens.end() - 
Count; unsigned AddLength = 0; for (size_t i = 1; i < Count; ++i) { // If there is whitespace separating the token and the previous one, diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index bc6766a47f5c7..2fe4ebd4ff8eb 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -183,9 +183,9 @@ class LevelIndentTracker { unsigned Indent = 0; }; -const FormatToken *getMatchingNamespaceToken( - const AnnotatedLine *Line, - const SmallVectorImpl &AnnotatedLines) { +const FormatToken * +getMatchingNamespaceToken(const AnnotatedLine *Line, + const ArrayRef &AnnotatedLines) { if (!Line->startsWith(tok::r_brace)) return nullptr; size_t StartLineIndex = Line->MatchingOpeningBlockLineIndex; @@ -200,9 +200,9 @@ StringRef getNamespaceTokenText(const AnnotatedLine *Line) { return NamespaceToken ? NamespaceToken->TokenText : StringRef(); } -StringRef getMatchingNamespaceTokenText( - const AnnotatedLine *Line, - const SmallVectorImpl &AnnotatedLines) { +StringRef +getMatchingNamespaceTokenText(const AnnotatedLine *Line, + const ArrayRef &AnnotatedLines) { const FormatToken *NamespaceToken = getMatchingNamespaceToken(Line, AnnotatedLines); return NamespaceToken ? NamespaceToken->TokenText : StringRef(); @@ -241,8 +241,8 @@ class LineJoiner { /// Calculates how many lines can be merged into 1 starting at \p I. unsigned tryFitMultipleLinesInOne(LevelIndentTracker &IndentTracker, - SmallVectorImpl::const_iterator I, - SmallVectorImpl::const_iterator E) { + ArrayRef::const_iterator I, + ArrayRef::const_iterator E) { const unsigned Indent = IndentTracker.getIndent(); // Can't join the last line with anything. 
@@ -614,8 +614,8 @@ class LineJoiner { } unsigned - tryMergeSimplePPDirective(SmallVectorImpl::const_iterator I, - SmallVectorImpl::const_iterator E, + tryMergeSimplePPDirective(ArrayRef::const_iterator I, + ArrayRef::const_iterator E, unsigned Limit) { if (Limit == 0) return 0; @@ -626,8 +626,8 @@ class LineJoiner { return 1; } - unsigned tryMergeNamespace(SmallVectorImpl::const_iterator I, - SmallVectorImpl::const_iterator E, + unsigned tryMergeNamespace(ArrayRef::const_iterator I, + ArrayRef::const_iterator E, unsigned Limit) { if (Limit == 0) return 0; @@ -692,9 +692,10 @@ class LineJoiner { return 2; } - unsigned tryMergeSimpleControlStatement( - SmallVectorImpl::const_iterator I, - SmallVectorImpl::const_iterator E, unsigned Limit) { + unsigned + tryMergeSimpleControlStatement(ArrayRef::const_iterator I, + ArrayRef::const_iterator E, + unsigned Limit) { if (Limit == 0) return 0; if (Style.BraceWrapping.AfterControlStatement == @@ -734,10 +735,9 @@ class LineJoiner { return 1; } - unsigned - tryMergeShortCaseLabels(SmallVectorImpl::const_iterator I, - SmallVectorImpl::const_iterator E, - unsigned Limit) { + unsigned tryMergeShortCaseLabels(ArrayRef::const_iterator I, + ArrayRef::const_iterator E, + unsigned Limit) { if (Limit == 0 || I + 1 == E || I[1]->First->isOneOf(tok::kw_case, tok::kw_default)) { return 0; @@ -768,7 +768,7 @@ class LineJoiner { if (Line->First->is(tok::comment)) { if (Level != Line->Level) return 0; - SmallVectorImpl::const_iterator J = I + 2 + NumStmts; + const auto *J = I + 2 + NumStmts; for (; J != E; ++J) { Line = *J; if (Line->InPPDirective != InPPDirective) @@ -789,10 +789,9 @@ class LineJoiner { return NumStmts; } - unsigned - tryMergeSimpleBlock(SmallVectorImpl::const_iterator I, - SmallVectorImpl::const_iterator E, - unsigned Limit) { + unsigned tryMergeSimpleBlock(ArrayRef::const_iterator I, + ArrayRef::const_iterator E, + unsigned Limit) { // Don't merge with a preprocessor directive. 
if (I[1]->Type == LT_PreprocessorDirective) return 0; @@ -974,10 +973,9 @@ class LineJoiner { /// Returns the modified column limit for \p I if it is inside a macro and /// needs a trailing '\'. - unsigned - limitConsideringMacros(SmallVectorImpl::const_iterator I, - SmallVectorImpl::const_iterator E, - unsigned Limit) { + unsigned limitConsideringMacros(ArrayRef::const_iterator I, + ArrayRef::const_iterator E, + unsigned Limit) { if (I[0]->InPPDirective && I + 1 != E && !I[1]->First->HasUnescapedNewline && I[1]->First->isNot(tok::eof)) { return Limit < 2 ? 0 : Limit - 2; @@ -985,15 +983,15 @@ class LineJoiner { return Limit; } - bool nextTwoLinesFitInto(SmallVectorImpl::const_iterator I, + bool nextTwoLinesFitInto(ArrayRef::const_iterator I, unsigned Limit) { if (I[1]->First->MustBreakBefore || I[2]->First->MustBreakBefore) return false; return 1 + I[1]->Last->TotalLength + 1 + I[2]->Last->TotalLength <= Limit; } - bool nextNLinesFitInto(SmallVectorImpl::const_iterator I, - SmallVectorImpl::const_iterator E, + bool nextNLinesFitInto(ArrayRef::const_iterator I, + ArrayRef::const_iterator E, unsigned Limit) { unsigned JoinedLength = 0; for (const auto *J = I + 1; J != E; ++J) { @@ -1034,9 +1032,9 @@ class LineJoiner { const FormatStyle &Style; const AdditionalKeywords &Keywords; - const SmallVectorImpl::const_iterator End; + const ArrayRef::const_iterator End; - SmallVectorImpl::const_iterator Next; + ArrayRef::const_iterator Next; const SmallVectorImpl &AnnotatedLines; }; diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 654148a161bd7..5375eef90c579 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -51,9 +51,7 @@ void printLine(llvm::raw_ostream &OS, const UnwrappedLine &Line, << "T=" << (unsigned)I->Tok->getType() << ", OC=" << I->Tok->OriginalColumn << ", \"" << I->Tok->TokenText << "\"] "; - for (SmallVectorImpl::const_iterator - CI = I->Children.begin(), - 
CE = I->Children.end(); + for (const auto *CI = I->Children.begin(), *CE = I->Children.end(); CI != CE; ++CI) { OS << "\n"; printLine(OS, *CI, (Prefix + " ").str()); @@ -4788,8 +4786,7 @@ void UnwrappedLineParser::nextToken(int LevelDifference) { } void UnwrappedLineParser::distributeComments( - const SmallVectorImpl &Comments, - const FormatToken *NextTok) { + const ArrayRef &Comments, const FormatToken *NextTok) { // Whether or not a line comment token continues a line is controlled by // the method continuesLineCommentSection, with the following caveat: // @@ -5011,7 +5008,7 @@ void UnwrappedLineParser::readToken(int LevelDifference) { namespace { template void pushTokens(Iterator Begin, Iterator End, - llvm::SmallVectorImpl &Into) { + SmallVectorImpl &Into) { for (auto I = Begin; I != End; ++I) { Into.push_back(I->Tok); for (const auto &Child : I->Children) diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h index b7daf8d9f4401..8160d5e84186e 100644 --- a/clang/lib/Format/UnwrappedLineParser.h +++ b/clang/lib/Format/UnwrappedLineParser.h @@ -228,7 +228,7 @@ class UnwrappedLineParser { // NextTok specifies the next token. A null pointer NextTok is supported, and // signifies either the absence of a next token, or that the next token // shouldn't be taken into account for the analysis. - void distributeComments(const SmallVectorImpl &Comments, + void distributeComments(const ArrayRef &Comments, const FormatToken *NextTok); // Adds the comment preceding the next token to unwrapped lines. From 44b83e81b5a48d543bf718907f00a21179ec03a4 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 4 Jan 2025 16:22:54 -0800 Subject: [PATCH 173/480] [clang-format] Add TT_AfterPPDirective for better annotation (#121622) For now, we only need to annotate the token after #error or #warning. Fixes #117706. 
--- clang/lib/Format/FormatToken.h | 1 + clang/lib/Format/TokenAnnotator.cpp | 4 ++++ clang/lib/Format/UnwrappedLineParser.cpp | 9 +++++++-- clang/unittests/Format/TokenAnnotatorTest.cpp | 7 +++++++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index 8917049cefb86..0fd3a49c71f9d 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -25,6 +25,7 @@ namespace clang { namespace format { #define LIST_TOKEN_TYPES \ + TYPE(AfterPPDirective) \ TYPE(ArrayInitializerLSquare) \ TYPE(ArraySubscriptLSquare) \ TYPE(AttributeColon) \ diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index b0f570966a63f..fad375733ef84 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -4941,6 +4941,10 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, Right.is(TT_ModulePartitionColon)) { return true; } + + if (Right.is(TT_AfterPPDirective)) + return true; + // No space between import foo:bar but keep a space between import :bar; if (Left.is(tok::identifier) && Right.is(TT_ModulePartitionColon)) return false; diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 5375eef90c579..46fd566ae221e 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -1030,6 +1030,12 @@ void UnwrappedLineParser::parsePPDirective() { case tok::pp_pragma: parsePPPragma(); break; + case tok::pp_error: + case tok::pp_warning: + nextToken(); + if (!eof() && Style.isCpp()) + FormatTok->setFinalizedType(TT_AfterPPDirective); + [[fallthrough]]; default: parsePPUnknown(); break; @@ -1209,9 +1215,8 @@ void UnwrappedLineParser::parsePPPragma() { } void UnwrappedLineParser::parsePPUnknown() { - do { + while (!eof()) nextToken(); - } while (!eof()); if (Style.IndentPPDirectives != FormatStyle::PPDIS_None) Line->Level += 
PPBranchLevel + 1; addUnwrappedLine(); diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index d61b9adf4f58c..875feff3d5420 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -3635,6 +3635,13 @@ TEST_F(TokenAnnotatorTest, SwitchInMacroArgument) { EXPECT_TOKEN(Tokens[9], tok::l_brace, TT_FunctionLBrace); } +TEST_F(TokenAnnotatorTest, AfterPPDirective) { + auto Tokens = annotate("#error -- My error message"); + + ASSERT_EQ(Tokens.size(), 7u) << Tokens; + EXPECT_TOKEN(Tokens[2], tok::minusminus, TT_AfterPPDirective); +} + } // namespace } // namespace format } // namespace clang From c1ea05eaf0fbe4b539952689dbf9f0df716c72e7 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 4 Jan 2025 16:24:41 -0800 Subject: [PATCH 174/480] [clang-format] Don't break short macro call followed by l_paren (#121626) Fixes #105658. --- clang/lib/Format/UnwrappedLineParser.cpp | 4 +++- clang/unittests/Format/FormatTest.cpp | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 46fd566ae221e..39aa37af480c9 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -2044,7 +2044,9 @@ void UnwrappedLineParser::parseStructuralElement( ? 
FormatTok->NewlinesBefore > 0 : CommentsBeforeNextToken.front()->NewlinesBefore > 0; - if (FollowedByNewline && (Text.size() >= 5 || FunctionLike) && + if (FollowedByNewline && + (Text.size() >= 5 || + (FunctionLike && FormatTok->isNot(tok::l_paren))) && tokenCanStartNewLine(*FormatTok) && Text == Text.upper()) { if (PreviousToken->isNot(TT_UntouchableMacroFunc)) PreviousToken->setFinalizedType(TT_FunctionLikeOrFreestandingMacro); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 44b9dba249890..4d48bcacddead 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -5887,6 +5887,11 @@ TEST_F(FormatTest, MacrosWithoutTrailingSemicolon) { verifyFormat("SOME_WEIRD_LOG_MACRO << SomeThing;", "SOME_WEIRD_LOG_MACRO\n" "<< SomeThing;"); + verifyFormat("GGGG(ffff(xxxxxxxxxxxxxxxxxxxx)->yyyyyyyyyyyyyyyyyyyy)(foo);", + "GGGG(ffff(xxxxxxxxxxxxxxxxxxxx)->yyyyyyyyyyyyyyyyyyyy)\n" + "(foo);", + getLLVMStyleWithColumns(60)); + verifyFormat("VISIT_GL_CALL(GenBuffers, void, (GLsizei n, GLuint* buffers), " "(n, buffers))", getChromiumStyle(FormatStyle::LK_Cpp)); From a774adb017256ceae85ec92ce5148ed47e517540 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Sat, 4 Jan 2025 17:52:19 -0800 Subject: [PATCH 175/480] Bulk port 64-bit x86 builtins to TableGen (#121043) This PR follows https://github.com/llvm/llvm-project/pull/120831 for x86-64. Similar to that PR, this does a very mechanical port of X86 builtins to TableGen. There is a *lot* of improvement available here to use TableGen more effectively and collapse repeated structures. But those can now be follow-up PRs that restructure *within* the `.td` file. 
The current structure produces a file that exactly matches the original X-macros except for the differences outlined in https://github.com/llvm/llvm-project/pull/120831: - Horizontal whitespace - `long long` types now use `long long` outside of OpenCL, but switch to `long` in OpenCL where relevant. Otherwise, only the order of builtins change, and no tests regress. --- clang/include/clang/Basic/BuiltinsX86.td | 18 +- clang/include/clang/Basic/BuiltinsX86Base.td | 29 ++ clang/include/clang/Basic/BuiltinsX86_64.def | 253 ---------- clang/include/clang/Basic/BuiltinsX86_64.td | 485 +++++++++++++++++++ clang/include/clang/Basic/CMakeLists.txt | 4 + clang/include/clang/Basic/TargetBuiltins.h | 2 +- clang/lib/Basic/Targets/X86.cpp | 2 +- 7 files changed, 521 insertions(+), 272 deletions(-) create mode 100644 clang/include/clang/Basic/BuiltinsX86Base.td delete mode 100644 clang/include/clang/Basic/BuiltinsX86_64.def create mode 100644 clang/include/clang/Basic/BuiltinsX86_64.td diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 73678bc868bfd..18fc10eb85c02 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -10,23 +10,7 @@ // //===----------------------------------------------------------------------===// -include "clang/Basic/BuiltinsBase.td" - -class X86Builtin : TargetBuiltin { - let Spellings = ["__builtin_ia32_" # NAME]; - let Prototype = prototype; - let EnableOpenCLLong = 1; -} - -class X86NoPrefixBuiltin : TargetBuiltin { - let Spellings = [NAME]; - let Prototype = prototype; -} - -class X86LibBuiltin : TargetLibBuiltin { - let Spellings = [NAME]; - let Prototype = prototype; -} +include "clang/Basic/BuiltinsX86Base.td" def rdpmc : X86Builtin<"unsigned long long int(int)">; def rdtsc : X86Builtin<"unsigned long long int()">; diff --git a/clang/include/clang/Basic/BuiltinsX86Base.td b/clang/include/clang/Basic/BuiltinsX86Base.td new file mode 100644 index 
0000000000000..aca39c204516a --- /dev/null +++ b/clang/include/clang/Basic/BuiltinsX86Base.td @@ -0,0 +1,29 @@ +//===--- BuiltinsX86Base.td - X86 Builtin function classes ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86-specific builtin function classes. +// +//===----------------------------------------------------------------------===// + +include "clang/Basic/BuiltinsBase.td" + +class X86Builtin : TargetBuiltin { + let Spellings = ["__builtin_ia32_" # NAME]; + let Prototype = prototype; + let EnableOpenCLLong = 1; +} + +class X86NoPrefixBuiltin : TargetBuiltin { + let Spellings = [NAME]; + let Prototype = prototype; +} + +class X86LibBuiltin : TargetLibBuiltin { + let Spellings = [NAME]; + let Prototype = prototype; +} diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def deleted file mode 100644 index 57928a14b3b39..0000000000000 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ /dev/null @@ -1,253 +0,0 @@ -//===--- BuiltinsX86_64.def - X86-64 Builtin function database --*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the X86-64-specific builtin function database. Users of -// this file must define the BUILTIN macro to make use of this information. -// -//===----------------------------------------------------------------------===// - -// The format of this database matches clang/Basic/Builtins.def. 
- -#if defined(BUILTIN) && !defined(TARGET_BUILTIN) -# define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS) -#endif - -#if defined(BUILTIN) && !defined(TARGET_HEADER_BUILTIN) -# define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANG, FEATURE) BUILTIN(ID, TYPE, ATTRS) -#endif - -TARGET_HEADER_BUILTIN(_BitScanForward64, "UcUNi*ULLi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") -TARGET_HEADER_BUILTIN(_BitScanReverse64, "UcUNi*ULLi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") - -TARGET_HEADER_BUILTIN(__mulh, "LLiLLiLLi", "nch", INTRIN_H, ALL_MS_LANGUAGES, "") -TARGET_HEADER_BUILTIN(__umulh, "ULLiULLiULLi", "nch", INTRIN_H, ALL_MS_LANGUAGES, "") -TARGET_HEADER_BUILTIN(_mul128, "LLiLLiLLiLLi*", "nch", INTRIN_H, ALL_MS_LANGUAGES, "") -TARGET_HEADER_BUILTIN(_umul128, "ULLiULLiULLiULLi*", "nch", INTRIN_H, ALL_MS_LANGUAGES, "") - -TARGET_HEADER_BUILTIN(__faststorefence, "v", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") -TARGET_HEADER_BUILTIN(__shiftleft128, "ULLiULLiULLiUc", "nch", INTRIN_H, ALL_MS_LANGUAGES, "") -TARGET_HEADER_BUILTIN(__shiftright128, "ULLiULLiULLiUc", "nch", INTRIN_H, ALL_MS_LANGUAGES, "") - -TARGET_HEADER_BUILTIN(_InterlockedCompareExchange128, "UcLLiD*LLiLLiLLi*", "nh", INTRIN_H, ALL_MS_LANGUAGES, "cx16") - -TARGET_BUILTIN(__builtin_ia32_readeflags_u64, "UOi", "n", "") -TARGET_BUILTIN(__builtin_ia32_writeeflags_u64, "vUOi", "n", "") -TARGET_BUILTIN(__builtin_ia32_cvtss2si64, "OiV4f", "ncV:128:", "sse") -TARGET_BUILTIN(__builtin_ia32_cvttss2si64, "OiV4f", "ncV:128:", "sse") -TARGET_BUILTIN(__builtin_ia32_cvtsd2si64, "OiV2d", "ncV:128:", "sse2") -TARGET_BUILTIN(__builtin_ia32_cvttsd2si64, "OiV2d", "ncV:128:", "sse2") -TARGET_BUILTIN(__builtin_ia32_movnti64, "vOi*Oi", "n", "sse2") -TARGET_BUILTIN(__builtin_ia32_vec_set_v2di, "V2OiV2OiOiIi", "ncV:128:", "sse4.1") -TARGET_BUILTIN(__builtin_ia32_crc32di, "UOiUOiUOi", "nc", "crc32") -TARGET_BUILTIN(__builtin_ia32_vec_ext_v4di, "OiV4OiIi", "ncV:256:", "avx") -TARGET_BUILTIN(__builtin_ia32_vec_set_v4di, 
"V4OiV4OiOiIi", "ncV:256:", "avx") -TARGET_BUILTIN(__builtin_ia32_rdfsbase32, "Ui", "n", "fsgsbase") -TARGET_BUILTIN(__builtin_ia32_rdfsbase64, "UOi", "n", "fsgsbase") -TARGET_BUILTIN(__builtin_ia32_rdgsbase32, "Ui", "n", "fsgsbase") -TARGET_BUILTIN(__builtin_ia32_rdgsbase64, "UOi", "n", "fsgsbase") -TARGET_BUILTIN(__builtin_ia32_wrfsbase32, "vUi", "n", "fsgsbase") -TARGET_BUILTIN(__builtin_ia32_wrfsbase64, "vUOi", "n", "fsgsbase") -TARGET_BUILTIN(__builtin_ia32_wrgsbase32, "vUi", "n", "fsgsbase") -TARGET_BUILTIN(__builtin_ia32_wrgsbase64, "vUOi", "n", "fsgsbase") -TARGET_BUILTIN(__builtin_ia32_fxrstor64, "vv*", "n", "fxsr") -TARGET_BUILTIN(__builtin_ia32_fxsave64, "vv*", "n", "fxsr") -TARGET_BUILTIN(__builtin_ia32_xsave64, "vv*UOi", "n", "xsave") -TARGET_BUILTIN(__builtin_ia32_xrstor64, "vv*UOi", "n", "xsave") -TARGET_BUILTIN(__builtin_ia32_xsaveopt64, "vv*UOi", "n", "xsaveopt") -TARGET_BUILTIN(__builtin_ia32_xrstors64, "vv*UOi", "n", "xsaves") -TARGET_BUILTIN(__builtin_ia32_xsavec64, "vv*UOi", "n", "xsavec") -TARGET_BUILTIN(__builtin_ia32_xsaves64, "vv*UOi", "n", "xsaves") -TARGET_BUILTIN(__builtin_ia32_incsspq, "vUOi", "n", "shstk") -TARGET_BUILTIN(__builtin_ia32_rdsspq, "UOiUOi", "n", "shstk") -TARGET_BUILTIN(__builtin_ia32_wrssq, "vUOiv*", "n", "shstk") -TARGET_BUILTIN(__builtin_ia32_wrussq, "vUOiv*", "n", "shstk") -TARGET_BUILTIN(__builtin_ia32_addcarryx_u64, "UcUcUOiUOiUOi*", "nE", "") -TARGET_BUILTIN(__builtin_ia32_subborrow_u64, "UcUcUOiUOiUOi*", "nE", "") -TARGET_BUILTIN(__builtin_ia32_rdrand64_step, "UiUOi*", "n", "rdrnd") -TARGET_BUILTIN(__builtin_ia32_rdseed64_step, "UiUOi*", "n", "rdseed") -TARGET_BUILTIN(__builtin_ia32_lzcnt_u64, "UOiUOi", "ncE", "lzcnt") -TARGET_BUILTIN(__builtin_ia32_bextr_u64, "UOiUOiUOi", "ncE", "bmi") -TARGET_BUILTIN(__builtin_ia32_tzcnt_u64, "UOiUOi", "ncE", "") -TARGET_BUILTIN(__builtin_ia32_bzhi_di, "UOiUOiUOi", "ncE", "bmi2") -TARGET_BUILTIN(__builtin_ia32_pdep_di, "UOiUOiUOi", "ncE", "bmi2") 
-TARGET_BUILTIN(__builtin_ia32_pext_di, "UOiUOiUOi", "ncE", "bmi2") -TARGET_BUILTIN(__builtin_ia32_bextri_u64, "UOiUOiIUOi", "ncE", "tbm") -TARGET_BUILTIN(__builtin_ia32_lwpins64, "UcUOiUiIUi", "n", "lwp") -TARGET_BUILTIN(__builtin_ia32_lwpval64, "vUOiUiIUi", "n", "lwp") -TARGET_BUILTIN(__builtin_ia32_vcvtsd2si64, "OiV2dIi", "ncV:128:", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vcvtsd2usi64, "UOiV2dIi", "ncV:128:", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vcvtss2si64, "OiV4fIi", "ncV:128:", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vcvtss2usi64, "UOiV4fIi", "ncV:128:", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vcvttsd2si64, "OiV2dIi", "ncV:128:", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vcvttsd2usi64, "UOiV2dIi", "ncV:128:", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vcvttss2si64, "OiV4fIi", "ncV:128:", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vcvttss2usi64, "UOiV4fIi", "ncV:128:", "avx512f") -TARGET_BUILTIN(__builtin_ia32_cvtsi2sd64, "V2dV2dOiIi", "ncV:128:", "avx512f") -TARGET_BUILTIN(__builtin_ia32_cvtsi2ss64, "V4fV4fOiIi", "ncV:128:", "avx512f") -TARGET_BUILTIN(__builtin_ia32_cvtusi2sd64, "V2dV2dUOiIi", "ncV:128:", "avx512f") -TARGET_BUILTIN(__builtin_ia32_cvtusi2ss64, "V4fV4fUOiIi", "ncV:128:", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vcvtsh2si64, "OiV8xIi", "ncV:128:", "avx512fp16") -TARGET_BUILTIN(__builtin_ia32_vcvtsh2usi64, "UOiV8xIi", "ncV:128:", "avx512fp16") -TARGET_BUILTIN(__builtin_ia32_vcvtusi642sh, "V8xV8xUOiIi", "ncV:128:", "avx512fp16") -TARGET_BUILTIN(__builtin_ia32_vcvtsi642sh, "V8xV8xOiIi", "ncV:128:", "avx512fp16") -TARGET_BUILTIN(__builtin_ia32_vcvttsh2si64, "OiV8xIi", "ncV:128:", "avx512fp16") -TARGET_BUILTIN(__builtin_ia32_vcvttsh2usi64, "UOiV8xIi", "ncV:128:", "avx512fp16") -TARGET_BUILTIN(__builtin_ia32_directstore_u64, "vULi*ULi", "n", "movdiri") - -// AVX10.2 SATCVT-DS -TARGET_BUILTIN(__builtin_ia32_vcvttsd2sis64, "OiV2dIi", "ncV:128:", "avx10.2-256") -TARGET_BUILTIN(__builtin_ia32_vcvttsd2usis64, "UOiV2dIi", "ncV:128:", 
"avx10.2-256") -TARGET_BUILTIN(__builtin_ia32_vcvttss2sis64, "OiV4fIi", "ncV:128:", "avx10.2-256") -TARGET_BUILTIN(__builtin_ia32_vcvttss2usis64, "UOiV4fIi", "ncV:128:", "avx10.2-256") - -// UINTR -TARGET_BUILTIN(__builtin_ia32_clui, "v", "n", "uintr") -TARGET_BUILTIN(__builtin_ia32_stui, "v", "n", "uintr") -TARGET_BUILTIN(__builtin_ia32_testui, "Uc", "n", "uintr") -TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", "uintr") -// USERMSR -TARGET_BUILTIN(__builtin_ia32_urdmsr, "ULLiULLi", "n", "usermsr") -TARGET_BUILTIN(__builtin_ia32_uwrmsr, "vULLiULLi", "n", "usermsr") - -// AMX internal builtin -TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile") -TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile") -TARGET_BUILTIN(__builtin_ia32_tileloaddrs64_internal, "V256iUsUsvC*z", "n", "amx-movrs") -TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile") -TARGET_BUILTIN(__builtin_ia32_tileloaddrst164_internal, "V256iUsUsvC*z", "n", "amx-movrs") -TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") -TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") -TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") -TARGET_BUILTIN(__builtin_ia32_tdpbuud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") -TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-tile") -TARGET_BUILTIN(__builtin_ia32_tilezero_internal, "V256iUsUs", "n", "amx-tile") -TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-bf16") -TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16") -TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") -TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", 
"amx-complex") -TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") -TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") -TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") -TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") -TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose") -TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-bf16,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_tconjtfp16_internal, "V256iUsUsV256i", "n", "amx-complex,amx-transpose") - -TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512") 
-TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh_internal, "V32xUsUsV256iUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl_internal, "V32xUsUsV256iUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tilemovrow_internal, "V16iUsUsV256iUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tmmultf32ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-tf32") -TARGET_BUILTIN(__builtin_ia32_ttmmultf32ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-tf32,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_tdpbf8ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp8") -TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp8") -TARGET_BUILTIN(__builtin_ia32_tdphbf8ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp8") -TARGET_BUILTIN(__builtin_ia32_tdphf8ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp8") - -// AMX -TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile") -TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile") -TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile") -TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile") -TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose") - -TARGET_BUILTIN(__builtin_ia32_tileloaddrs64, "vIUcvC*z", "n", "amx-movrs") -TARGET_BUILTIN(__builtin_ia32_tileloaddrst164, "vIUcvC*z", "n", "amx-movrs") - -TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile") -TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile") -TARGET_BUILTIN(__builtin_ia32_tilestored64, "vIUcv*z", "n", "amx-tile") - -TARGET_BUILTIN(__builtin_ia32_tdpbssd, "vIUcIUcIUc", 
"n", "amx-int8") -TARGET_BUILTIN(__builtin_ia32_tdpbsud, "vIUcIUcIUc", "n", "amx-int8") -TARGET_BUILTIN(__builtin_ia32_tdpbusd, "vIUcIUcIUc", "n", "amx-int8") -TARGET_BUILTIN(__builtin_ia32_tdpbuud, "vIUcIUcIUc", "n", "amx-int8") -TARGET_BUILTIN(__builtin_ia32_tdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16") -TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite") - -TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex") -TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex") - -TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0, "vIUcvC*z", "n", "amx-transpose") -TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n","amx-transpose") -TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose") -TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose") -TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose") -TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose") -TARGET_BUILTIN(__builtin_ia32_tconjtfp16, "vIUcIUc", "n", "amx-complex,amx-transpose") - -TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps, "V16fIUcUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h, "V32yIUcUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l, "V32yIUcUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh, "V32xIUcUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl, "V32xIUcUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tilemovrow, "V16iIUcUi", "n", 
"amx-avx512,avx10.2-512") - -// AMX_FP16 FP16 -TARGET_BUILTIN(__builtin_ia32_tdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16") - -// AMX FP8 -TARGET_BUILTIN(__builtin_ia32_tdpbf8ps, "vIUcUIcUIc", "n", "amx-fp8") -TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps, "vIUcUIcUIc", "n", "amx-fp8") -TARGET_BUILTIN(__builtin_ia32_tdphbf8ps, "vIUcUIcUIc", "n", "amx-fp8") -TARGET_BUILTIN(__builtin_ia32_tdphf8ps, "vIUcUIcUIc", "n", "amx-fp8") - -// AMX TF32 -TARGET_BUILTIN(__builtin_ia32_tmmultf32ps, "vIUcIUcIUc", "n", "amx-tf32") -TARGET_BUILTIN(__builtin_ia32_ttmmultf32ps, "vIUcIUcIUc", "n", "amx-tf32,amx-transpose") - -TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi") -TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd") -TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", "cmpccxadd") - -// RAO-INT -TARGET_BUILTIN(__builtin_ia32_aadd64, "vv*SOi", "n", "raoint") -TARGET_BUILTIN(__builtin_ia32_aand64, "vv*SOi", "n", "raoint") -TARGET_BUILTIN(__builtin_ia32_aor64, "vv*SOi", "n", "raoint") -TARGET_BUILTIN(__builtin_ia32_axor64, "vv*SOi", "n", "raoint") - -// MOVRS -TARGET_BUILTIN(__builtin_ia32_movrsqi, "ScvC*", "n", "movrs") -TARGET_BUILTIN(__builtin_ia32_movrshi, "SsvC*", "n", "movrs") -TARGET_BUILTIN(__builtin_ia32_movrssi, "SivC*", "n", "movrs") -TARGET_BUILTIN(__builtin_ia32_movrsdi, "SLLivC*", "n", "movrs") - -// MOVRS and AVX10.2 -TARGET_BUILTIN(__builtin_ia32_vmovrsb128, "V16cV16cC*", "nV:128:", "movrs,avx10.2-256") -TARGET_BUILTIN(__builtin_ia32_vmovrsb256, "V32cV32cC*", "nV:256:", "movrs,avx10.2-256") -TARGET_BUILTIN(__builtin_ia32_vmovrsb512, "V64cV64cC*", "nV:512:", "movrs,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_vmovrsd128, "V4iV4iC*", "nV:128:", "movrs,avx10.2-256") -TARGET_BUILTIN(__builtin_ia32_vmovrsd256, "V8iV8iC*", "nV:256:", "movrs,avx10.2-256") -TARGET_BUILTIN(__builtin_ia32_vmovrsd512, "V16iV16iC*", "nV:512:", "movrs,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_vmovrsq128, "V2OiV2OiC*", "nV:128:", 
"movrs,avx10.2-256") -TARGET_BUILTIN(__builtin_ia32_vmovrsq256, "V4OiV4OiC*", "nV:256:", "movrs,avx10.2-256") -TARGET_BUILTIN(__builtin_ia32_vmovrsq512, "V8OiV8OiC*", "nV:512:", "movrs,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_vmovrsw128, "V8sV8sC*", "nV:128:", "movrs,avx10.2-256") -TARGET_BUILTIN(__builtin_ia32_vmovrsw256, "V16sV16sC*", "nV:256:", "movrs,avx10.2-256") -TARGET_BUILTIN(__builtin_ia32_vmovrsw512, "V32sV32sC*", "nV:512:", "movrs,avx10.2-512") - -#undef BUILTIN -#undef TARGET_BUILTIN -#undef TARGET_HEADER_BUILTIN diff --git a/clang/include/clang/Basic/BuiltinsX86_64.td b/clang/include/clang/Basic/BuiltinsX86_64.td new file mode 100644 index 0000000000000..a6c6ef80eac21 --- /dev/null +++ b/clang/include/clang/Basic/BuiltinsX86_64.td @@ -0,0 +1,485 @@ +//===--- BuiltinsX86_64.td - X86-64 Builtin function database ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86-64-specific builtin function database. 
+// +//===----------------------------------------------------------------------===// + +include "clang/Basic/BuiltinsX86Base.td" + +let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in { + def _BitScanForward64 : X86LibBuiltin<"unsigned char(msuint32_t *, unsigned long long int)">; + def _BitScanReverse64 : X86LibBuiltin<"unsigned char(msuint32_t *, unsigned long long int)">; +} + +let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, Const, RequireDeclaration] in { + def __mulh : X86LibBuiltin<"long long int(long long int, long long int)">; + def __umulh : X86LibBuiltin<"unsigned long long int(unsigned long long int, unsigned long long int)">; + def _mul128 : X86LibBuiltin<"long long int(long long int, long long int, long long int *)">; + def _umul128 : X86LibBuiltin<"unsigned long long int(unsigned long long int, unsigned long long int, unsigned long long int *)">; +} + +let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in { + def __faststorefence : X86LibBuiltin<"void()">; +} + +let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, Const, RequireDeclaration] in { + def __shiftleft128 : X86LibBuiltin<"unsigned long long int(unsigned long long int, unsigned long long int, unsigned char)">; + def __shiftright128 : X86LibBuiltin<"unsigned long long int(unsigned long long int, unsigned long long int, unsigned char)">; +} + +let Features = "cx16", Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in { + def _InterlockedCompareExchange128 : X86LibBuiltin<"unsigned char(long long int volatile *, long long int, long long int, long long int *)">; +} + +let Attributes = [NoThrow] in { + def readeflags_u64 : X86Builtin<"unsigned long long int()">; + def writeeflags_u64 : X86Builtin<"void(unsigned long long int)">; +} + +let Features = "sse", Attributes = [NoThrow, Const, 
RequiredVectorWidth<128>] in { + def cvtss2si64 : X86Builtin<"long long int(_Vector<4, float>)">; + def cvttss2si64 : X86Builtin<"long long int(_Vector<4, float>)">; +} + +let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { + def cvtsd2si64 : X86Builtin<"long long int(_Vector<2, double>)">; + def cvttsd2si64 : X86Builtin<"long long int(_Vector<2, double>)">; +} + +let Features = "sse2", Attributes = [NoThrow] in { + def movnti64 : X86Builtin<"void(long long int *, long long int)">; +} + +let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { + def vec_set_v2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, long long int, _Constant int)">; +} + +let Features = "crc32", Attributes = [NoThrow, Const] in { + def crc32di : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">; +} + +let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { + def vec_ext_v4di : X86Builtin<"long long int(_Vector<4, long long int>, _Constant int)">; + def vec_set_v4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, long long int, _Constant int)">; +} + +let Features = "fsgsbase", Attributes = [NoThrow] in { + def rdfsbase32 : X86Builtin<"unsigned int()">; + def rdfsbase64 : X86Builtin<"unsigned long long int()">; + def rdgsbase32 : X86Builtin<"unsigned int()">; + def rdgsbase64 : X86Builtin<"unsigned long long int()">; + def wrfsbase32 : X86Builtin<"void(unsigned int)">; + def wrfsbase64 : X86Builtin<"void(unsigned long long int)">; + def wrgsbase32 : X86Builtin<"void(unsigned int)">; + def wrgsbase64 : X86Builtin<"void(unsigned long long int)">; +} + +let Features = "fxsr", Attributes = [NoThrow] in { + def fxrstor64 : X86Builtin<"void(void *)">; + def fxsave64 : X86Builtin<"void(void *)">; +} + +let Features = "xsave", Attributes = [NoThrow] in { + def xsave64 : X86Builtin<"void(void *, unsigned long long int)">; + def xrstor64 : 
X86Builtin<"void(void *, unsigned long long int)">; +} + +let Features = "xsaveopt", Attributes = [NoThrow] in { + def xsaveopt64 : X86Builtin<"void(void *, unsigned long long int)">; +} + +let Features = "xsaves", Attributes = [NoThrow] in { + def xrstors64 : X86Builtin<"void(void *, unsigned long long int)">; +} + +let Features = "xsavec", Attributes = [NoThrow] in { + def xsavec64 : X86Builtin<"void(void *, unsigned long long int)">; +} + +let Features = "xsaves", Attributes = [NoThrow] in { + def xsaves64 : X86Builtin<"void(void *, unsigned long long int)">; +} + +let Features = "shstk", Attributes = [NoThrow] in { + def incsspq : X86Builtin<"void(unsigned long long int)">; + def rdsspq : X86Builtin<"unsigned long long int(unsigned long long int)">; + def wrssq : X86Builtin<"void(unsigned long long int, void *)">; + def wrussq : X86Builtin<"void(unsigned long long int, void *)">; +} + +let Attributes = [NoThrow, Constexpr] in { + def addcarryx_u64 : X86Builtin<"unsigned char(unsigned char, unsigned long long int, unsigned long long int, unsigned long long int *)">; + def subborrow_u64 : X86Builtin<"unsigned char(unsigned char, unsigned long long int, unsigned long long int, unsigned long long int *)">; +} + +let Features = "rdrnd", Attributes = [NoThrow] in { + def rdrand64_step : X86Builtin<"unsigned int(unsigned long long int *)">; +} + +let Features = "rdseed", Attributes = [NoThrow] in { + def rdseed64_step : X86Builtin<"unsigned int(unsigned long long int *)">; +} + +let Features = "lzcnt", Attributes = [NoThrow, Const, Constexpr] in { + def lzcnt_u64 : X86Builtin<"unsigned long long int(unsigned long long int)">; +} + +let Features = "bmi", Attributes = [NoThrow, Const, Constexpr] in { + def bextr_u64 : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">; +} + +let Attributes = [NoThrow, Const, Constexpr] in { + def tzcnt_u64 : X86Builtin<"unsigned long long int(unsigned long long int)">; +} + +let Features = "bmi2", 
Attributes = [NoThrow, Const, Constexpr] in { + def bzhi_di : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">; + def pdep_di : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">; + def pext_di : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">; +} + +let Features = "tbm", Attributes = [NoThrow, Const, Constexpr] in { + def bextri_u64 : X86Builtin<"unsigned long long int(unsigned long long int, _Constant unsigned long long int)">; +} + +let Features = "lwp", Attributes = [NoThrow] in { + def lwpins64 : X86Builtin<"unsigned char(unsigned long long int, unsigned int, _Constant unsigned int)">; + def lwpval64 : X86Builtin<"void(unsigned long long int, unsigned int, _Constant unsigned int)">; +} + +let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { + def vcvtsd2si64 : X86Builtin<"long long int(_Vector<2, double>, _Constant int)">; + def vcvtsd2usi64 : X86Builtin<"unsigned long long int(_Vector<2, double>, _Constant int)">; + def vcvtss2si64 : X86Builtin<"long long int(_Vector<4, float>, _Constant int)">; + def vcvtss2usi64 : X86Builtin<"unsigned long long int(_Vector<4, float>, _Constant int)">; + def vcvttsd2si64 : X86Builtin<"long long int(_Vector<2, double>, _Constant int)">; + def vcvttsd2usi64 : X86Builtin<"unsigned long long int(_Vector<2, double>, _Constant int)">; + def vcvttss2si64 : X86Builtin<"long long int(_Vector<4, float>, _Constant int)">; + def vcvttss2usi64 : X86Builtin<"unsigned long long int(_Vector<4, float>, _Constant int)">; + def cvtsi2sd64 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, long long int, _Constant int)">; + def cvtsi2ss64 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, long long int, _Constant int)">; + def cvtusi2sd64 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, unsigned long long int, _Constant int)">; + def cvtusi2ss64 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, 
unsigned long long int, _Constant int)">; +} + +let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { + def vcvtsh2si64 : X86Builtin<"long long int(_Vector<8, _Float16>, _Constant int)">; + def vcvtsh2usi64 : X86Builtin<"unsigned long long int(_Vector<8, _Float16>, _Constant int)">; + def vcvtusi642sh : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, unsigned long long int, _Constant int)">; + def vcvtsi642sh : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, long long int, _Constant int)">; + def vcvttsh2si64 : X86Builtin<"long long int(_Vector<8, _Float16>, _Constant int)">; + def vcvttsh2usi64 : X86Builtin<"unsigned long long int(_Vector<8, _Float16>, _Constant int)">; +} + +let Features = "movdiri", Attributes = [NoThrow] in { + def directstore_u64 : X86Builtin<"void(unsigned long int *, unsigned long int)">; +} + +let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { + def vcvttsd2sis64 : X86Builtin<"long long int(_Vector<2, double>, _Constant int)">; + def vcvttsd2usis64 : X86Builtin<"unsigned long long int(_Vector<2, double>, _Constant int)">; + def vcvttss2sis64 : X86Builtin<"long long int(_Vector<4, float>, _Constant int)">; + def vcvttss2usis64 : X86Builtin<"unsigned long long int(_Vector<4, float>, _Constant int)">; +} + +let Features = "uintr", Attributes = [NoThrow] in { + def clui : X86Builtin<"void()">; + def stui : X86Builtin<"void()">; + def testui : X86Builtin<"unsigned char()">; + def senduipi : X86Builtin<"void(uint64_t)">; +} + +let Features = "usermsr", Attributes = [NoThrow] in { + def urdmsr : X86Builtin<"unsigned long long int(unsigned long long int)">; + def uwrmsr : X86Builtin<"void(unsigned long long int, unsigned long long int)">; +} + +let Features = "amx-tile", Attributes = [NoThrow] in { + def tile_loadconfig_internal : X86Builtin<"void(void const *)">; + def tileloadd64_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, void 
const *, size_t)">; +} + +let Features = "amx-movrs", Attributes = [NoThrow] in { + def tileloaddrs64_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, void const *, size_t)">; +} + +let Features = "amx-tile", Attributes = [NoThrow] in { + def tileloaddt164_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, void const *, size_t)">; +} + +let Features = "amx-movrs", Attributes = [NoThrow] in { + def tileloaddrst164_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, void const *, size_t)">; +} + +let Features = "amx-int8", Attributes = [NoThrow] in { + def tdpbssd_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; + def tdpbsud_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; + def tdpbusd_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; + def tdpbuud_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; +} + +let Features = "amx-tile", Attributes = [NoThrow] in { + def tilestored64_internal : X86Builtin<"void(unsigned short, unsigned short, void *, size_t, _Vector<256, int>)">; + def tilezero_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short)">; +} + +let Features = "amx-bf16", Attributes = [NoThrow] in { + def tdpbf16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; +} + +let Features = "amx-fp16", Attributes = [NoThrow] in { + def tdpfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; +} + 
+let Features = "amx-complex", Attributes = [NoThrow] in { + def tcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; + def tcmmrlfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; +} + +let Features = "amx-transpose", Attributes = [NoThrow] in { + def t2rpntlvwz0_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">; +} + +let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in { + def t2rpntlvwz0rs_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">; +} + +let Features = "amx-transpose", Attributes = [NoThrow] in { + def t2rpntlvwz0t1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">; +} + +let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in { + def t2rpntlvwz0rst1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">; +} + +let Features = "amx-transpose", Attributes = [NoThrow] in { + def t2rpntlvwz1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">; +} + +let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in { + def t2rpntlvwz1rs_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">; +} + +let Features = "amx-transpose", Attributes = [NoThrow] in { + def t2rpntlvwz1t1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int 
*>, void const *, size_t)">; +} + +let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in { + def t2rpntlvwz1rst1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">; +} + +let Features = "amx-transpose", Attributes = [NoThrow] in { + def ttransposed_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">; +} + +let Features = "amx-bf16,amx-transpose", Attributes = [NoThrow] in { + def ttdpbf16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; +} + +let Features = "amx-fp16,amx-transpose", Attributes = [NoThrow] in { + def ttdpfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; +} + +let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in { + def ttcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; + def ttcmmrlfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; + def tconjtcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; + def tconjtfp16_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">; +} + +let Features = "amx-avx512,avx10.2-512", Attributes = [NoThrow] in { + def tcvtrowd2ps_internal : X86Builtin<"_Vector<16, float>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">; + def tcvtrowps2pbf16h_internal : X86Builtin<"_Vector<32, __bf16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">; + def 
tcvtrowps2pbf16l_internal : X86Builtin<"_Vector<32, __bf16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">; + def tcvtrowps2phh_internal : X86Builtin<"_Vector<32, _Float16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">; + def tcvtrowps2phl_internal : X86Builtin<"_Vector<32, _Float16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">; + def tilemovrow_internal : X86Builtin<"_Vector<16, int>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">; +} + +let Features = "amx-tf32", Attributes = [NoThrow] in { + def tmmultf32ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; +} + +let Features = "amx-tf32,amx-transpose", Attributes = [NoThrow] in { + def ttmmultf32ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; +} + +let Features = "amx-fp8", Attributes = [NoThrow] in { + def tdpbf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; + def tdpbhf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; + def tdphbf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; + def tdphf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; +} + +let Features = "amx-tile", Attributes = [NoThrow] in { + def tile_loadconfig : X86Builtin<"void(void const *)">; + def tile_storeconfig : X86Builtin<"void(void const *)">; + def tilerelease : X86Builtin<"void()">; + def tilezero : X86Builtin<"void(unsigned char)">; +} + +let Features 
= "amx-movrs,amx-transpose", Attributes = [NoThrow] in { + def t2rpntlvwz0rs : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; + def t2rpntlvwz0rst1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; + def t2rpntlvwz1rs : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; + def t2rpntlvwz1rst1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; +} + +let Features = "amx-movrs", Attributes = [NoThrow] in { + def tileloaddrs64 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; + def tileloaddrst164 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; +} + +let Features = "amx-tile", Attributes = [NoThrow] in { + def tileloadd64 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; + def tileloaddt164 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; + def tilestored64 : X86Builtin<"void(_Constant unsigned char, void *, size_t)">; +} + +let Features = "amx-int8", Attributes = [NoThrow] in { + def tdpbssd : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; + def tdpbsud : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; + def tdpbusd : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; + def tdpbuud : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; +} + +let Features = "amx-bf16", Attributes = [NoThrow] in { + def tdpbf16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; +} + +let Features = "ptwrite", Attributes = [NoThrow] in { + def ptwrite64 : X86Builtin<"void(unsigned long long int)">; +} + +let Features = "amx-complex", Attributes = [NoThrow] in { + def tcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; + def tcmmrlfp16ps : 
X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; +} + +let Features = "amx-transpose", Attributes = [NoThrow] in { + def t2rpntlvwz0 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; + def t2rpntlvwz0t1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; + def t2rpntlvwz1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; + def t2rpntlvwz1t1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; + def ttransposed : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">; +} + +let Features = "amx-bf16,amx-transpose", Attributes = [NoThrow] in { + def ttdpbf16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; +} + +let Features = "amx-fp16,amx-transpose", Attributes = [NoThrow] in { + def ttdpfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; +} + +let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in { + def ttcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; + def ttcmmrlfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; + def tconjtcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; + def tconjtfp16 : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">; +} + +let Features = "amx-avx512,avx10.2-512", Attributes = [NoThrow] in { + def tcvtrowd2ps : X86Builtin<"_Vector<16, float>(_Constant unsigned char, unsigned int)">; + def tcvtrowps2pbf16h : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, unsigned int)">; + def tcvtrowps2pbf16l : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, unsigned int)">; + def tcvtrowps2phh : X86Builtin<"_Vector<32, _Float16>(_Constant unsigned char, unsigned int)">; + def tcvtrowps2phl : 
X86Builtin<"_Vector<32, _Float16>(_Constant unsigned char, unsigned int)">; + def tilemovrow : X86Builtin<"_Vector<16, int>(_Constant unsigned char, unsigned int)">; +} + +let Features = "amx-fp16", Attributes = [NoThrow] in { + def tdpfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; +} + +let Features = "amx-fp8", Attributes = [NoThrow] in { + def tdpbf8ps : X86Builtin<"void(_Constant unsigned char, unsigned _Constant char, unsigned _Constant char)">; + def tdpbhf8ps : X86Builtin<"void(_Constant unsigned char, unsigned _Constant char, unsigned _Constant char)">; + def tdphbf8ps : X86Builtin<"void(_Constant unsigned char, unsigned _Constant char, unsigned _Constant char)">; + def tdphf8ps : X86Builtin<"void(_Constant unsigned char, unsigned _Constant char, unsigned _Constant char)">; +} + +let Features = "amx-tf32", Attributes = [NoThrow] in { + def tmmultf32ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; +} + +let Features = "amx-tf32,amx-transpose", Attributes = [NoThrow] in { + def ttmmultf32ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; +} + +let Features = "prefetchi", Attributes = [NoThrow, Const] in { + def prefetchi : X86Builtin<"void(void const *, unsigned int)">; +} + +let Features = "cmpccxadd", Attributes = [NoThrow] in { + def cmpccxadd32 : X86Builtin<"signed int(void *, signed int, signed int, _Constant int)">; + def cmpccxadd64 : X86Builtin<"signed long long int(signed long long int *, signed long long int, signed long long int, _Constant int)">; +} + +let Features = "raoint", Attributes = [NoThrow] in { + def aadd64 : X86Builtin<"void(void *, signed long long int)">; + def aand64 : X86Builtin<"void(void *, signed long long int)">; + def aor64 : X86Builtin<"void(void *, signed long long int)">; + def axor64 : X86Builtin<"void(void *, signed long long int)">; +} + +let Features = "movrs", 
Attributes = [NoThrow] in { + def movrsqi : X86Builtin<"signed char(void const *)">; + def movrshi : X86Builtin<"signed short(void const *)">; + def movrssi : X86Builtin<"signed int(void const *)">; + def movrsdi : X86Builtin<"signed long long int(void const *)">; +} + +let Features = "movrs,avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in { + def vmovrsb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char const *>)">; +} + +let Features = "movrs,avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in { + def vmovrsb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char const *>)">; +} + +let Features = "movrs,avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { + def vmovrsb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char const *>)">; +} + +let Features = "movrs,avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in { + def vmovrsd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int const *>)">; +} + +let Features = "movrs,avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in { + def vmovrsd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int const *>)">; +} + +let Features = "movrs,avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { + def vmovrsd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int const *>)">; +} + +let Features = "movrs,avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in { + def vmovrsq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int const *>)">; +} + +let Features = "movrs,avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in { + def vmovrsq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int const *>)">; +} + +let Features = "movrs,avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { + def vmovrsq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int const *>)">; +} + +let Features = "movrs,avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in { + def vmovrsw128 : 
X86Builtin<"_Vector<8, short>(_Vector<8, short const *>)">; +} + +let Features = "movrs,avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in { + def vmovrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short const *>)">; +} + +let Features = "movrs,avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { + def vmovrsw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short const *>)">; +} diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt index 76ac3367e23a6..1ccc73892fe6e 100644 --- a/clang/include/clang/Basic/CMakeLists.txt +++ b/clang/include/clang/Basic/CMakeLists.txt @@ -64,6 +64,10 @@ clang_tablegen(BuiltinsX86.inc -gen-clang-builtins SOURCE BuiltinsX86.td TARGET ClangBuiltinsX86) +clang_tablegen(BuiltinsX86_64.inc -gen-clang-builtins + SOURCE BuiltinsX86_64.td + TARGET ClangBuiltinsX86_64) + # ARM NEON and MVE clang_tablegen(arm_neon.inc -gen-arm-neon-sema SOURCE arm_neon.td diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h index 556332dd4b217..914be3691ee81 100644 --- a/clang/include/clang/Basic/TargetBuiltins.h +++ b/clang/include/clang/Basic/TargetBuiltins.h @@ -128,7 +128,7 @@ namespace clang { FirstX86_64Builtin, LastX86CommonBuiltin = FirstX86_64Builtin - 1, #define BUILTIN(ID, TYPE, ATTRS) BI##ID, -#include "clang/Basic/BuiltinsX86_64.def" +#include "clang/Basic/BuiltinsX86_64.inc" LastTSBuiltin }; } diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 7e5a5c78aa6b5..d2d92fb864c31 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -38,7 +38,7 @@ static constexpr Builtin::Info BuiltinInfoX86[] = { {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::NO_HEADER, ALL_LANGUAGES}, #define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANGS, FEATURE) \ {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::HEADER, LANGS}, -#include "clang/Basic/BuiltinsX86_64.def" +#include 
"clang/Basic/BuiltinsX86_64.inc" }; static const char *const GCCRegNames[] = { From ddba0365b33da70543f9add3037d7402371014c9 Mon Sep 17 00:00:00 2001 From: Haohai Wen Date: Sun, 5 Jan 2025 10:08:29 +0800 Subject: [PATCH 176/480] [LLD] Track cg_profile from combination of obj and ordering file (#121324) Add tests to track section reordering when both cg_profile section and call-graph-ordering-file were given. --- lld/test/COFF/cgprofile-obj.s | 19 +++++++++++++------ lld/test/ELF/cgprofile-obj.s | 18 +++++++++++++----- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/lld/test/COFF/cgprofile-obj.s b/lld/test/COFF/cgprofile-obj.s index b267850c46382..3cb16df80eb46 100644 --- a/lld/test/COFF/cgprofile-obj.s +++ b/lld/test/COFF/cgprofile-obj.s @@ -2,9 +2,12 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t # RUN: lld-link /subsystem:console /entry:A %t /out:%t2 /debug:symtab -# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s --check-prefix=CG-OBJ # RUN: lld-link /call-graph-profile-sort:no /subsystem:console /entry:A %t /out:%t3 /debug:symtab # RUN: llvm-nm --numeric-sort %t3 | FileCheck %s --check-prefix=NO-CG +# RUN: echo "D A 200" > %t.call_graph +# RUN: lld-link /subsystem:console /entry:A %t /out:%t4 /debug:symtab /call-graph-ordering-file:%t.call_graph +# RUN: llvm-nm --numeric-sort %t4 | FileCheck %s --check-prefix=CG-OBJ-OF .section .text,"ax", one_only, D D: @@ -33,13 +36,17 @@ Aa: .cg_profile B, C, 30 .cg_profile C, D, 90 -# CHECK: 140001000 T A -# CHECK: 140001001 T B -# CHECK: 140001002 T C -# CHECK: 140001003 t D - +# CG-OBJ: 140001000 T A +# CG-OBJ: 140001001 T B +# CG-OBJ: 140001002 T C +# CG-OBJ: 140001003 t D # NO-CG: 140001000 t D # NO-CG: 140001001 T C # NO-CG: 140001002 T B # NO-CG: 140001003 T A + +# CG-OBJ-OF: 140001000 T C +# CG-OBJ-OF: 140001001 t D +# CG-OBJ-OF: 140001002 T A +# CG-OBJ-OF: 140001003 T B diff --git a/lld/test/ELF/cgprofile-obj.s 
b/lld/test/ELF/cgprofile-obj.s index 0848adc5e4279..82c5d035540a9 100644 --- a/lld/test/ELF/cgprofile-obj.s +++ b/lld/test/ELF/cgprofile-obj.s @@ -2,12 +2,15 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o # RUN: ld.lld -e A %t.o -o %t -# RUN: llvm-nm --no-sort %t | FileCheck %s +# RUN: llvm-nm --no-sort %t | FileCheck %s --check-prefix=CG-OBJ # RUN: ld.lld --call-graph-profile-sort=none -e A %t.o -o %t # RUN: llvm-nm --no-sort %t | FileCheck %s --check-prefix=NO-CG ## --no-call-graph-profile-sort is an alias for --call-graph-profile-sort=none. # RUN: ld.lld --no-call-graph-profile-sort -e A %t.o -o %t1 # RUN: cmp %t %t1 +# RUN: echo "D A 200" > %t.call_graph +# RUN: ld.lld -e A %t.o -call-graph-ordering-file=%t.call_graph -o %t2 +# RUN: llvm-nm --no-sort %t2 | FileCheck %s --check-prefix=CG-OBJ-OF .section .text.D,"ax",@progbits D: @@ -36,12 +39,17 @@ Aa: .cg_profile B, C, 30 .cg_profile C, D, 90 -# CHECK: 0000000000201123 t D -# CHECK: 0000000000201122 T C -# CHECK: 0000000000201121 T B -# CHECK: 0000000000201120 T A +# CG-OBJ: 0000000000201123 t D +# CG-OBJ: 0000000000201122 T C +# CG-OBJ: 0000000000201121 T B +# CG-OBJ: 0000000000201120 T A # NO-CG: 0000000000201120 t D # NO-CG: 0000000000201121 T C # NO-CG: 0000000000201122 T B # NO-CG: 0000000000201123 T A + +# CG-OBJ-OF: 0000000000201121 t D +# CG-OBJ-OF: 0000000000201120 T C +# CG-OBJ-OF: 0000000000201123 T B +# CG-OBJ-OF: 0000000000201122 T A From 36dd421355791c9c9f6552f8730b36b8953ce5db Mon Sep 17 00:00:00 2001 From: JaydeepChauhan14 Date: Sun, 5 Jan 2025 07:53:37 +0530 Subject: [PATCH 177/480] [X86][AVX10.2] Map vector saturated converts to public intrinsics (#121483) We already have support for saturated convert ISA in llvm. With this patch we mapped public llvm intrinsic onto saturated convert ISA. It includes support for float, double into sign and unsigned int. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 66 +++++++- llvm/lib/Target/X86/X86ISelLowering.h | 4 + llvm/lib/Target/X86/X86InstrAVX10.td | 64 +++++++ llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 7 + .../CodeGen/X86/avx10_2_512fptosi_satcvtds.ll | 85 ++++++++++ .../CodeGen/X86/avx10_2fptosi_satcvtds.ll | 158 +++++++++++++++++- 6 files changed, 380 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a0514e93d6598..07b9a30b57564 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -341,8 +341,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } if (Subtarget.hasAVX10_2()) { - setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Legal); - setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Legal); + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v2i32, Custom); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v2i32, Custom); + for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, + MVT::v4i64}) { + setOperationAction(ISD::FP_TO_UINT_SAT, VT, Legal); + setOperationAction(ISD::FP_TO_SINT_SAT, VT, Legal); + } + if (Subtarget.hasAVX10_2_512()) { + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i64, Legal); + } if (Subtarget.is64Bit()) { setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Legal); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Legal); @@ -2656,6 +2665,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, + ISD::FP_TO_SINT_SAT, + ISD::FP_TO_UINT_SAT, ISD::SETCC, ISD::MUL, ISD::XOR, @@ -33665,6 +33676,26 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } return; } + case ISD::FP_TO_SINT_SAT: + case ISD::FP_TO_UINT_SAT: { + if (!Subtarget.hasAVX10_2()) + return; + + bool IsSigned 
= Opc == ISD::FP_TO_SINT_SAT; + EVT VT = N->getValueType(0); + SDValue Op = N->getOperand(0); + EVT OpVT = Op.getValueType(); + SDValue Res; + + if (VT == MVT::v2i32 && OpVT == MVT::v2f64) { + if (IsSigned) + Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op); + else + Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op); + Results.push_back(Res); + } + return; + } case ISD::FP_TO_SINT: case ISD::STRICT_FP_TO_SINT: case ISD::FP_TO_UINT: @@ -34645,6 +34676,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VPERMV3) NODE_NAME_CASE(VPERMI) NODE_NAME_CASE(VPTERNLOG) + NODE_NAME_CASE(FP_TO_SINT_SAT) + NODE_NAME_CASE(FP_TO_UINT_SAT) NODE_NAME_CASE(VFIXUPIMM) NODE_NAME_CASE(VFIXUPIMM_SAE) NODE_NAME_CASE(VFIXUPIMMS) @@ -56202,6 +56235,33 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS +static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (!Subtarget.hasAVX10_2()) + return SDValue(); + + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT; + EVT SrcVT = N->getOperand(0).getValueType(); + EVT DstVT = N->getValueType(0); + SDLoc dl(N); + + if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) { + SDValue V2F32Value = DAG.getUNDEF(SrcVT); + + // Concatenate the original v2f32 input and V2F32Value to create v4f32 + SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + N->getOperand(0), V2F32Value); + + // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node + if (IsSigned) + return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc); + + return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc); + } + return SDValue(); +} + static bool needCarryOrOverflowFlag(SDValue Flags) { assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!"); @@ -59315,6 +59375,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::INTRINSIC_WO_CHAIN: return 
combineINTRINSIC_WO_CHAIN(N, DAG, DCI); case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI); case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI); + case ISD::FP_TO_SINT_SAT: + case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget); // clang-format on } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 2b7a8eaf249d8..eaedaa0b88d22 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -908,6 +908,10 @@ namespace llvm { // Load x87 FPU environment from memory. FLDENVm, + // Custom handling for FP_TO_xINT_SAT + FP_TO_SINT_SAT, + FP_TO_UINT_SAT, + /// This instruction implements FP_TO_SINT with the /// integer destination in memory and a FP reg source. This corresponds /// to the X86::FIST*m instructions and the rounding mode change stuff. It diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index cda6998778bc4..127016184bc17 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -834,6 +834,70 @@ let Predicates = [HasAVX10_2] in { // patterns have been disabled with null_frag. 
// Patterns VCVTTPD2DQSZ128 +// VCVTTPD2DQS +def : Pat<(v4i32(X86fp2sisat(v2f64 VR128X:$src))), + (VCVTTPD2DQSZ128rr VR128X:$src)>; +def : Pat<(v4i32(fp_to_sint_sat(v4f64 VR256X:$src), i32)), + (VCVTTPD2DQSZ256rr VR256X:$src)>; +def : Pat<(v8i32(fp_to_sint_sat(v8f64 VR512:$src), i32)), + (VCVTTPD2DQSZrr VR512:$src)>; + +// VCVTTPD2QQS +def : Pat<(v2i64(fp_to_sint_sat(v2f64 VR128X:$src), i64)), + (VCVTTPD2QQSZ128rr VR128X:$src)>; +def : Pat<(v4i64(fp_to_sint_sat(v4f64 VR256X:$src), i64)), + (VCVTTPD2QQSZ256rr VR256X:$src)>; +def : Pat<(v8i64(fp_to_sint_sat(v8f64 VR512:$src), i64)), + (VCVTTPD2QQSZrr VR512:$src)>; + +// VCVTTPD2UDQS +def : Pat<(v4i32(X86fp2uisat(v2f64 VR128X:$src))), + (VCVTTPD2UDQSZ128rr VR128X:$src)>; +def : Pat<(v4i32(fp_to_uint_sat(v4f64 VR256X:$src), i32)), + (VCVTTPD2UDQSZ256rr VR256X:$src)>; +def : Pat<(v8i32(fp_to_uint_sat(v8f64 VR512:$src), i32)), + (VCVTTPD2UDQSZrr VR512:$src)>; + +// VCVTTPD2UQQS +def : Pat<(v2i64(fp_to_uint_sat(v2f64 VR128X:$src), i64)), + (VCVTTPD2UQQSZ128rr VR128X:$src)>; +def : Pat<(v4i64(fp_to_uint_sat(v4f64 VR256X:$src), i64)), + (VCVTTPD2UQQSZ256rr VR256X:$src)>; +def : Pat<(v8i64(fp_to_uint_sat(v8f64 VR512:$src), i64)), + (VCVTTPD2UQQSZrr VR512:$src)>; + +// VCVTTPS2DQS +def : Pat<(v4i32(fp_to_sint_sat(v4f32 VR128X:$src), i32)), + (VCVTTPS2DQSZ128rr VR128X:$src)>; +def : Pat<(v8i32(fp_to_sint_sat(v8f32 VR256X:$src), i32)), + (VCVTTPS2DQSZ256rr VR256X:$src)>; +def : Pat<(v16i32(fp_to_sint_sat(v16f32 VR512:$src), i32)), + (VCVTTPS2DQSZrr VR512:$src)>; + +// VCVTTPS2QQS +def : Pat<(v2i64(X86fp2sisat(v4f32 VR128X:$src))), + (VCVTTPS2QQSZ128rr VR128X:$src)>; +def : Pat<(v4i64(fp_to_sint_sat(v4f32 VR128X:$src), i64)), + (VCVTTPS2QQSZ256rr VR128X:$src)>; +def : Pat<(v8i64(fp_to_sint_sat(v8f32 VR256X:$src), i64)), + (VCVTTPS2QQSZrr VR256X:$src)>; + +// VCVTTPS2UDQS +def : Pat<(v4i32(fp_to_uint_sat(v4f32 VR128X:$src), i32)), + (VCVTTPS2UDQSZ128rr VR128X:$src)>; +def : Pat<(v8i32(fp_to_uint_sat(v8f32 VR256X:$src), i32)), + 
(VCVTTPS2UDQSZ256rr VR256X:$src)>; +def : Pat<(v16i32(fp_to_uint_sat(v16f32 VR512:$src), i32)), + (VCVTTPS2UDQSZrr VR512:$src)>; + +// VCVTTPS2UQQS +def : Pat<(v2i64(X86fp2uisat(v4f32 VR128X:$src))), + (VCVTTPS2UQQSZ128rr VR128X:$src)>; +def : Pat<(v4i64(fp_to_uint_sat(v4f32 VR128X:$src), i64)), + (VCVTTPS2UQQSZ256rr VR128X:$src)>; +def : Pat<(v8i64(fp_to_uint_sat(v8f32 VR256X:$src), i64)), + (VCVTTPS2UQQSZrr VR256X:$src)>; + def : Pat<(v4i32 (X86cvttp2sis (v2f64 VR128X:$src))), (VCVTTPD2DQSZ128rr VR128X:$src)>; def : Pat<(v4i32 (X86cvttp2sis (loadv2f64 addr:$src))), diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index f6231b78f4c2e..af0267a7d32c3 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -390,6 +390,13 @@ def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisSameAs<1,3>, SDTCisFP<0>, SDTCisVT<4, i32>]>; +def SDTFPToxIntSatOp + : SDTypeProfile<1, + 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisFP<1>]>; + +def X86fp2sisat : SDNode<"X86ISD::FP_TO_SINT_SAT", SDTFPToxIntSatOp>; +def X86fp2uisat : SDNode<"X86ISD::FP_TO_UINT_SAT", SDTFPToxIntSatOp>; + def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>, diff --git a/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll new file mode 100644 index 0000000000000..d7ad7b048c6d6 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64 + +; VCVTTPD2DQS +define <8 x i32> @test_signed_v8i32_v8f64(<8 x double> %f) nounwind { +; CHECK-LABEL: 
test_signed_v8i32_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dqs %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> %f) + ret <8 x i32> %x +} + +; VCVTTPD2QQS +define <8 x i64> @test_signed_v8i64_v8f64(<8 x double> %f) nounwind { +; CHECK-LABEL: test_signed_v8i64_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2qqs %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> %f) + ret <8 x i64> %x +} + +; VCVTTPD2UDQS +define <8 x i32> @test_unsigned_v8i32_v8f64(<8 x double> %f) nounwind { +; CHECK-LABEL: test_unsigned_v8i32_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2udqs %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> %f) + ret <8 x i32> %x +} + +; VCVTTPD2UQQS +define <8 x i64> @test_unsigned_v8i64_v8f64(<8 x double> %f) nounwind { +; CHECK-LABEL: test_unsigned_v8i64_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2uqqs %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> %f) + ret <8 x i64> %x +} + +; VCVTTPS2DQS +define <16 x i32> @test_signed_v16i32_v16f32(<16 x float> %f) nounwind { +; CHECK-LABEL: test_signed_v16i32_v16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dqs %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> %f) + ret <16 x i32> %x +} + +; VCVTTPS2UDQS +define <16 x i32> @test_unsigned_v16i32_v16f32(<16 x float> %f) nounwind { +; CHECK-LABEL: test_unsigned_v16i32_v16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2udqs %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> %f) + ret <16 x i32> %x +} +; VCVTTPS2QQS +define <8 x i64> @test_signed_v8i64_v8f32(<8 x float> %f) nounwind { +; CHECK-LABEL: test_signed_v8i64_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2qqs %ymm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x 
i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> %f) + ret <8 x i64> %x +} + +; VCVTTPS2UQQS +define <8 x i64> @test_unsigned_v8i64_v8f32(<8 x float> %f) nounwind { +; CHECK-LABEL: test_unsigned_v8i64_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2uqqs %ymm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> %f) + ret <8 x i64> %x +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; X64: {{.*}} +; X86: {{.*}} diff --git a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll index 494e4bc8e068e..a2f167e94cc23 100644 --- a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll +++ b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64 ; ; 32-bit float to signed integer @@ -112,3 +112,157 @@ define i64 @test_signed_i64_f64(double %f) nounwind { %x = call i64 @llvm.fptosi.sat.i64.f64(double %f) ret i64 %x } + +; VCVTTPD2DQS +define <2 x i32> @test_signed_v2i32_v2f64(<2 x double> %d) nounwind { +; CHECK-LABEL: test_signed_v2i32_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dqs %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> %d) + ret <2 x i32> %x +} + +define <4 x i32> @test_signed_v4i32_v4f64(<4 x double> %f) nounwind { +; CHECK-LABEL: test_signed_v4i32_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dqs %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %x = call <4 x i32> 
@llvm.fptosi.sat.v4i32.v4f64(<4 x double> %f) + ret <4 x i32> %x +} + +; VCVTTPD2QQS +define <2 x i64> @test_signed_v2i64_v2f64(<2 x double> %f) nounwind { +; CHECK-LABEL: test_signed_v2i64_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2qqs %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> %f) + ret <2 x i64> %x +} + +define <4 x i64> @test_signed_v4i64_v4f64(<4 x double> %f) nounwind { +; CHECK-LABEL: test_signed_v4i64_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2qqs %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> %f) + ret <4 x i64> %x +} + +; VCVTTPD2UDQS +define <2 x i32> @test_unsigned_v2i32_v2f64(<2 x double> %d) nounwind { +; CHECK-LABEL: test_unsigned_v2i32_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2udqs %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> %d) + ret <2 x i32> %x +} + +define <4 x i32> @test_unsigned_v4i32_v4f64(<4 x double> %f) nounwind { +; CHECK-LABEL: test_unsigned_v4i32_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2udqs %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %x = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> %f) + ret <4 x i32> %x +} + +; VCVTTPD2UQQS +define <2 x i64> @test_unsigned_v2i64_v2f64(<2 x double> %f) nounwind { +; CHECK-LABEL: test_unsigned_v2i64_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2uqqs %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> %f) + ret <2 x i64> %x +} + +define <4 x i64> @test_unsigned_v4i64_v4f64(<4 x double> %f) nounwind { +; CHECK-LABEL: test_unsigned_v4i64_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2uqqs %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> %f) + ret <4 x i64> %x +} + +; VCVTTPS2DQS +define <4 x i32> @test_signed_v4i32_v4f32(<4 x float> %f) nounwind { +; 
CHECK-LABEL: test_signed_v4i32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dqs %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> %f) + ret <4 x i32> %x +} + +define <8 x i32> @test_signed_v8i32_v8f32(<8 x float> %f) nounwind { +; CHECK-LABEL: test_signed_v8i32_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dqs %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> %f) + ret <8 x i32> %x +} + +; VCVTTPS2UDQS +define <4 x i32> @test_unsigned_v4i32_v4f32(<4 x float> %f) nounwind { +; CHECK-LABEL: test_unsigned_v4i32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2udqs %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> %f) + ret <4 x i32> %x +} + +define <8 x i32> @test_unsigned_v8i32_v8f32(<8 x float> %f) nounwind { +; CHECK-LABEL: test_unsigned_v8i32_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2udqs %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> %f) + ret <8 x i32> %x +} + +; VCVTTPS2QQS +define <2 x i64> @test_signed_v2i64_v2f32(<2 x float> %f) nounwind { +; CHECK-LABEL: test_signed_v2i64_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2qqs %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> %f) + ret <2 x i64> %x +} + +define <4 x i64> @test_signed_v4i64_v4f32(<4 x float> %f) nounwind { +; CHECK-LABEL: test_signed_v4i64_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2qqs %xmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> %f) + ret <4 x i64> %x +} + +; VCVTTPS2UQQS +define <2 x i64> @test_unsigned_v2i64_v2f32(<2 x float> %f) nounwind { +; CHECK-LABEL: test_unsigned_v2i64_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2uqqs %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> %f) + ret <2 x 
i64> %x +} + +define <4 x i64> @test_unsigned_v4i64_v4f32(<4 x float> %f) nounwind { +; CHECK-LABEL: test_unsigned_v4i64_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2uqqs %xmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> %f) + ret <4 x i64> %x +} From 2d9d291da0fb94c05b31de0b08ccb881dbead798 Mon Sep 17 00:00:00 2001 From: Haohai Wen Date: Sun, 5 Jan 2025 10:38:14 +0800 Subject: [PATCH 178/480] [LLD] Do not combine cg_profile from obj and ordering file (#121325) cg_profile in object is from CGProfilePass and it is often inaccurate. While call-graph-ordering-file is provided by user. It is weird to aggregate them together especially when call-graph-ordering-file is accurate enough. --- lld/COFF/Driver.cpp | 6 +++--- lld/ELF/Driver.cpp | 5 +++-- lld/test/COFF/cgprofile-obj.s | 8 ++++---- lld/test/ELF/cgprofile-obj.s | 8 ++++---- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 83d3f5d4cf99c..791382fd9bdd4 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -2878,10 +2878,10 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // Handle /call-graph-ordering-file and /call-graph-profile-sort (default on). if (config->callGraphProfileSort) { llvm::TimeTraceScope timeScope("Call graph"); - if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file)) { + if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file)) parseCallGraphFile(arg->getValue()); - } - readCallGraphsFromObjectFiles(ctx); + else + readCallGraphsFromObjectFiles(ctx); } // Handle /print-symbol-order. 
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index f573a8d3e19f3..e8e99fa874b5d 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -3215,11 +3215,12 @@ template void LinkerDriver::link(opt::InputArgList &args) { // Read the callgraph now that we know what was gced or icfed if (ctx.arg.callGraphProfileSort != CGProfileSortKind::None) { - if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file)) + if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file)) { if (std::optional buffer = readFile(ctx, arg->getValue())) readCallGraph(ctx, *buffer); - readCallGraphsFromObjectFiles(ctx); + } else + readCallGraphsFromObjectFiles(ctx); } // Write the result to the file. diff --git a/lld/test/COFF/cgprofile-obj.s b/lld/test/COFF/cgprofile-obj.s index 3cb16df80eb46..756279a8b5759 100644 --- a/lld/test/COFF/cgprofile-obj.s +++ b/lld/test/COFF/cgprofile-obj.s @@ -46,7 +46,7 @@ Aa: # NO-CG: 140001002 T B # NO-CG: 140001003 T A -# CG-OBJ-OF: 140001000 T C -# CG-OBJ-OF: 140001001 t D -# CG-OBJ-OF: 140001002 T A -# CG-OBJ-OF: 140001003 T B +# CG-OBJ-OF: 140001000 t D +# CG-OBJ-OF: 140001001 T A +# CG-OBJ-OF: 140001004 T C +# CG-OBJ-OF: 140001005 T B \ No newline at end of file diff --git a/lld/test/ELF/cgprofile-obj.s b/lld/test/ELF/cgprofile-obj.s index 82c5d035540a9..14016658707af 100644 --- a/lld/test/ELF/cgprofile-obj.s +++ b/lld/test/ELF/cgprofile-obj.s @@ -49,7 +49,7 @@ Aa: # NO-CG: 0000000000201122 T B # NO-CG: 0000000000201123 T A -# CG-OBJ-OF: 0000000000201121 t D -# CG-OBJ-OF: 0000000000201120 T C -# CG-OBJ-OF: 0000000000201123 T B -# CG-OBJ-OF: 0000000000201122 T A +# CG-OBJ-OF: 0000000000201120 t D +# CG-OBJ-OF: 0000000000201124 T C +# CG-OBJ-OF: 0000000000201125 T B +# CG-OBJ-OF: 0000000000201121 T A From 96eced624e0f120155256033fdcb8342e7e58d6e Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Sun, 5 Jan 2025 10:50:52 +0800 Subject: [PATCH 179/480] [Clang] Implement CWG2369 "Ordering between constraints and substitution" (#102857) This patch 
partially implements CWG2369 for non-lambda-constrained functions. Lambdas are left intact at this point because we need extra work to correctly instantiate captures before the function instantiation. As a premise of CWG2369, this patch also implements CWG2770 to ensure the function parameters are instantiated on demand. Closes https://github.com/llvm/llvm-project/issues/54440 --- clang/include/clang/Sema/Sema.h | 22 +++- clang/include/clang/Sema/Template.h | 6 + clang/lib/Sema/SemaConcept.cpp | 47 ++++++- clang/lib/Sema/SemaTemplateDeduction.cpp | 49 +++++--- clang/lib/Sema/SemaTemplateDeductionGuide.cpp | 8 +- clang/lib/Sema/SemaTemplateInstantiate.cpp | 115 ++++++++++++++++-- clang/lib/Sema/TreeTransform.h | 2 +- clang/test/CXX/drs/cwg23xx.cpp | 32 +++++ clang/test/CXX/drs/cwg26xx.cpp | 2 +- clang/test/CXX/drs/cwg27xx.cpp | 20 +++ .../expr.prim.req/nested-requirement.cpp | 2 +- .../constrant-satisfaction-conversions.cpp | 6 +- .../SemaCXX/concept-crash-on-diagnostic.cpp | 2 +- clang/test/SemaCXX/cxx20-ctad-type-alias.cpp | 2 +- clang/test/SemaCXX/cxx23-assume.cpp | 6 +- clang/test/SemaCXX/cxx2c-fold-exprs.cpp | 2 +- clang/test/SemaCXX/lambda-unevaluated.cpp | 4 +- .../SemaTemplate/concepts-recursive-inst.cpp | 4 +- .../SemaTemplate/cxx2a-constraint-exprs.cpp | 2 +- clang/test/SemaTemplate/deduction-guide.cpp | 5 - .../nested-implicit-deduction-guides.cpp | 8 +- clang/www/cxx_dr_status.html | 4 +- 22 files changed, 288 insertions(+), 62 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 5ee7ea48cc983..af59b7f38c71a 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -13055,6 +13055,7 @@ class Sema final : public SemaBase { /// /// \param SkipForSpecialization when specified, any template specializations /// in a traversal would be ignored. 
+ /// /// \param ForDefaultArgumentSubstitution indicates we should continue looking /// when encountering a specialized member function template, rather than /// returning immediately. @@ -13066,6 +13067,17 @@ class Sema final : public SemaBase { bool SkipForSpecialization = false, bool ForDefaultArgumentSubstitution = false); + /// Apart from storing the result to \p Result, this behaves the same as + /// another overload. + void getTemplateInstantiationArgs( + MultiLevelTemplateArgumentList &Result, const NamedDecl *D, + const DeclContext *DC = nullptr, bool Final = false, + std::optional> Innermost = std::nullopt, + bool RelativeToPrimary = false, const FunctionDecl *Pattern = nullptr, + bool ForConstraintInstantiation = false, + bool SkipForSpecialization = false, + bool ForDefaultArgumentSubstitution = false); + /// RAII object to handle the state changes required to synthesize /// a function body. class SynthesizedFunctionScope { @@ -13335,7 +13347,7 @@ class Sema final : public SemaBase { ExprResult SubstConstraintExpr(Expr *E, const MultiLevelTemplateArgumentList &TemplateArgs); - // Unlike the above, this does not evaluates constraints. + // Unlike the above, this does not evaluate constraints. ExprResult SubstConstraintExprWithoutSatisfaction( Expr *E, const MultiLevelTemplateArgumentList &TemplateArgs); @@ -14456,10 +14468,10 @@ class Sema final : public SemaBase { const MultiLevelTemplateArgumentList &TemplateArgs, SourceRange TemplateIDRange); - bool CheckInstantiatedFunctionTemplateConstraints( - SourceLocation PointOfInstantiation, FunctionDecl *Decl, - ArrayRef TemplateArgs, - ConstraintSatisfaction &Satisfaction); + bool CheckFunctionTemplateConstraints(SourceLocation PointOfInstantiation, + FunctionDecl *Decl, + ArrayRef TemplateArgs, + ConstraintSatisfaction &Satisfaction); /// \brief Emit diagnostics explaining why a constraint expression was deemed /// unsatisfied. 
diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h index 9800f75f676aa..59a0575ca9803 100644 --- a/clang/include/clang/Sema/Template.h +++ b/clang/include/clang/Sema/Template.h @@ -522,6 +522,12 @@ enum class TemplateSubstitutionKind : char { llvm::PointerUnion * findInstantiationOf(const Decl *D); + /// Similar to \p findInstantiationOf(), but it wouldn't assert if the + /// instantiation was not found within the current instantiation scope. This + /// is helpful for on-demand declaration instantiation. + llvm::PointerUnion * + findInstantiationUnsafe(const Decl *D); + void InstantiatedLocal(const Decl *D, Decl *Inst); void InstantiatedLocalPackArg(const Decl *D, VarDecl *Inst); void MakeInstantiatedLocalArgPack(const Decl *D); diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 539de00bd104f..10f4920a761f3 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -846,7 +846,7 @@ bool Sema::CheckFunctionConstraints(const FunctionDecl *FD, bool ForOverloadResolution) { // Don't check constraints if the function is dependent. Also don't check if // this is a function template specialization, as the call to - // CheckinstantiatedFunctionTemplateConstraints after this will check it + // CheckFunctionTemplateConstraints after this will check it // better. 
if (FD->isDependentContext() || FD->getTemplatedKind() == @@ -1111,12 +1111,55 @@ bool Sema::EnsureTemplateArgumentListConstraints( return false; } -bool Sema::CheckInstantiatedFunctionTemplateConstraints( +static bool CheckFunctionConstraintsWithoutInstantiation( + Sema &SemaRef, SourceLocation PointOfInstantiation, + FunctionTemplateDecl *Template, ArrayRef TemplateArgs, + ConstraintSatisfaction &Satisfaction) { + SmallVector TemplateAC; + Template->getAssociatedConstraints(TemplateAC); + if (TemplateAC.empty()) { + Satisfaction.IsSatisfied = true; + return false; + } + + LocalInstantiationScope Scope(SemaRef); + + FunctionDecl *FD = Template->getTemplatedDecl(); + // Collect the list of template arguments relative to the 'primary' + // template. We need the entire list, since the constraint is completely + // uninstantiated at this point. + + // FIXME: Add TemplateArgs through the 'Innermost' parameter once + // the refactoring of getTemplateInstantiationArgs() relands. + MultiLevelTemplateArgumentList MLTAL; + MLTAL.addOuterTemplateArguments(Template, std::nullopt, /*Final=*/false); + SemaRef.getTemplateInstantiationArgs( + MLTAL, /*D=*/FD, FD, + /*Final=*/false, /*Innermost=*/std::nullopt, /*RelativeToPrimary=*/true, + /*Pattern=*/nullptr, /*ForConstraintInstantiation=*/true); + MLTAL.replaceInnermostTemplateArguments(Template, TemplateArgs); + + Sema::ContextRAII SavedContext(SemaRef, FD); + std::optional ThisScope; + if (auto *Method = dyn_cast(FD)) + ThisScope.emplace(SemaRef, /*Record=*/Method->getParent(), + /*ThisQuals=*/Method->getMethodQualifiers()); + return SemaRef.CheckConstraintSatisfaction( + Template, TemplateAC, MLTAL, PointOfInstantiation, Satisfaction); +} + +bool Sema::CheckFunctionTemplateConstraints( SourceLocation PointOfInstantiation, FunctionDecl *Decl, ArrayRef TemplateArgs, ConstraintSatisfaction &Satisfaction) { // In most cases we're not going to have constraints, so check for that first. 
FunctionTemplateDecl *Template = Decl->getPrimaryTemplate(); + + if (!Template) + return ::CheckFunctionConstraintsWithoutInstantiation( + *this, PointOfInstantiation, Decl->getDescribedFunctionTemplate(), + TemplateArgs, Satisfaction); + // Note - code synthesis context for the constraints check is created // inside CheckConstraintsSatisfaction. SmallVector TemplateAC; diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 1c1f6e30ab7b8..acd1151184e42 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -3936,18 +3936,6 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( Result != TemplateDeductionResult::Success) return Result; - // C++ [temp.deduct.call]p10: [DR1391] - // If deduction succeeds for all parameters that contain - // template-parameters that participate in template argument deduction, - // and all template arguments are explicitly specified, deduced, or - // obtained from default template arguments, remaining parameters are then - // compared with the corresponding arguments. For each remaining parameter - // P with a type that was non-dependent before substitution of any - // explicitly-specified template arguments, if the corresponding argument - // A cannot be implicitly converted to P, deduction fails. - if (CheckNonDependent()) - return TemplateDeductionResult::NonDependentConversionFailure; - // Form the template argument list from the deduced template arguments. TemplateArgumentList *SugaredDeducedArgumentList = TemplateArgumentList::CreateCopy(Context, SugaredBuilder); @@ -3977,6 +3965,39 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( FD = const_cast(FDFriend); Owner = FD->getLexicalDeclContext(); } + // C++20 [temp.deduct.general]p5: [CWG2369] + // If the function template has associated constraints, those constraints + // are checked for satisfaction. 
If the constraints are not satisfied, type + // deduction fails. + // + // FIXME: We haven't implemented CWG2369 for lambdas yet, because we need + // to figure out how to instantiate lambda captures to the scope without + // first instantiating the lambda. + bool IsLambda = isLambdaCallOperator(FD) || isLambdaConversionOperator(FD); + if (!IsLambda && !IsIncomplete) { + if (CheckFunctionTemplateConstraints( + Info.getLocation(), + FunctionTemplate->getCanonicalDecl()->getTemplatedDecl(), + CanonicalBuilder, Info.AssociatedConstraintsSatisfaction)) + return TemplateDeductionResult::MiscellaneousDeductionFailure; + if (!Info.AssociatedConstraintsSatisfaction.IsSatisfied) { + Info.reset(Info.takeSugared(), + TemplateArgumentList::CreateCopy(Context, CanonicalBuilder)); + return TemplateDeductionResult::ConstraintsNotSatisfied; + } + } + // C++ [temp.deduct.call]p10: [CWG1391] + // If deduction succeeds for all parameters that contain + // template-parameters that participate in template argument deduction, + // and all template arguments are explicitly specified, deduced, or + // obtained from default template arguments, remaining parameters are then + // compared with the corresponding arguments. For each remaining parameter + // P with a type that was non-dependent before substitution of any + // explicitly-specified template arguments, if the corresponding argument + // A cannot be implicitly converted to P, deduction fails. + if (CheckNonDependent()) + return TemplateDeductionResult::NonDependentConversionFailure; + MultiLevelTemplateArgumentList SubstArgs( FunctionTemplate, CanonicalDeducedArgumentList->asArray(), /*Final=*/false); @@ -4011,8 +4032,8 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( // ([temp.constr.decl]), those constraints are checked for satisfaction // ([temp.constr.constr]). If the constraints are not satisfied, type // deduction fails. 
- if (!IsIncomplete) { - if (CheckInstantiatedFunctionTemplateConstraints( + if (IsLambda && !IsIncomplete) { + if (CheckFunctionTemplateConstraints( Info.getLocation(), Specialization, CanonicalBuilder, Info.AssociatedConstraintsSatisfaction)) return TemplateDeductionResult::MiscellaneousDeductionFailure; diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index d42c3765aa534..5d6c11a75303d 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -902,10 +902,12 @@ Expr *buildIsDeducibleConstraint(Sema &SemaRef, Context.getTrivialTypeSourceInfo( Context.getDeducedTemplateSpecializationType( TemplateName(AliasTemplate), /*DeducedType=*/QualType(), - /*IsDependent=*/true)), // template specialization type whose - // arguments will be deduced. + /*IsDependent=*/true), + AliasTemplate->getLocation()), // template specialization type whose + // arguments will be deduced. Context.getTrivialTypeSourceInfo( - ReturnType), // type from which template arguments are deduced. + ReturnType, AliasTemplate->getLocation()), // type from which template + // arguments are deduced. }; return TypeTraitExpr::Create( Context, Context.getLogicalOperationType(), AliasTemplate->getLocation(), diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index fb0f38df62a74..cab9ae79ce5cb 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -475,6 +475,21 @@ MultiLevelTemplateArgumentList Sema::getTemplateInstantiationArgs( assert((ND || DC) && "Can't find arguments for a decl if one isn't provided"); // Accumulate the set of template argument lists in this structure. 
MultiLevelTemplateArgumentList Result; + getTemplateInstantiationArgs( + Result, ND, DC, Final, Innermost, RelativeToPrimary, Pattern, + ForConstraintInstantiation, SkipForSpecialization, + ForDefaultArgumentSubstitution); + return Result; +} + +void Sema::getTemplateInstantiationArgs( + MultiLevelTemplateArgumentList &Result, const NamedDecl *ND, + const DeclContext *DC, bool Final, + std::optional> Innermost, bool RelativeToPrimary, + const FunctionDecl *Pattern, bool ForConstraintInstantiation, + bool SkipForSpecialization, bool ForDefaultArgumentSubstitution) { + assert((ND || DC) && "Can't find arguments for a decl if one isn't provided"); + // Accumulate the set of template argument lists in this structure. using namespace TemplateInstArgsHelpers; const Decl *CurDecl = ND; @@ -535,14 +550,12 @@ MultiLevelTemplateArgumentList Sema::getTemplateInstantiationArgs( } if (R.IsDone) - return Result; + return; if (R.ClearRelativeToPrimary) RelativeToPrimary = false; assert(R.NextDecl); CurDecl = R.NextDecl; } - - return Result; } bool Sema::CodeSynthesisContext::isInstantiationRecord() const { @@ -1349,6 +1362,19 @@ namespace { // Whether an incomplete substituion should be treated as an error. bool BailOutOnIncomplete; + private: + bool isSubstitutingConstraints() const { + return llvm::any_of(SemaRef.CodeSynthesisContexts, [](auto &Context) { + return Context.Kind == + Sema::CodeSynthesisContext::ConstraintSubstitution; + }); + } + + // CWG2770: Function parameters should be instantiated when they are + // needed by a satisfaction check of an atomic constraint or + // (recursively) by another function parameter. 
+ bool maybeInstantiateFunctionParameterToScope(ParmVarDecl *OldParm); + public: typedef TreeTransform inherited; @@ -1405,12 +1431,19 @@ namespace { ArrayRef Unexpanded, bool &ShouldExpand, bool &RetainExpansion, std::optional &NumExpansions) { - return getSema().CheckParameterPacksForExpansion(EllipsisLoc, - PatternRange, Unexpanded, - TemplateArgs, - ShouldExpand, - RetainExpansion, - NumExpansions); + if (SemaRef.CurrentInstantiationScope && isSubstitutingConstraints()) { + for (UnexpandedParameterPack ParmPack : Unexpanded) { + NamedDecl *VD = ParmPack.first.dyn_cast(); + if (!isa_and_present(VD)) + continue; + if (maybeInstantiateFunctionParameterToScope(cast(VD))) + return true; + } + } + + return getSema().CheckParameterPacksForExpansion( + EllipsisLoc, PatternRange, Unexpanded, TemplateArgs, ShouldExpand, + RetainExpansion, NumExpansions); } void ExpandingFunctionParameterPack(ParmVarDecl *Pack) { @@ -1911,9 +1944,62 @@ Decl *TemplateInstantiator::TransformDecl(SourceLocation Loc, Decl *D) { // template parameter. } + if (SemaRef.CurrentInstantiationScope) { + if (isSubstitutingConstraints() && isa(D) && + maybeInstantiateFunctionParameterToScope(cast(D))) + return nullptr; + } + return SemaRef.FindInstantiatedDecl(Loc, cast(D), TemplateArgs); } +bool TemplateInstantiator::maybeInstantiateFunctionParameterToScope( + ParmVarDecl *OldParm) { + if (SemaRef.CurrentInstantiationScope->findInstantiationUnsafe(OldParm)) + return false; + // We're instantiating a function parameter whose associated function template + // has not been instantiated at this point for constraint evaluation, so make + // sure the instantiated parameters are owned by a function declaration such + // that they can be correctly 'captured' in tryCaptureVariable(). 
+ Sema::ContextRAII Context(SemaRef, OldParm->getDeclContext()); + + if (!OldParm->isParameterPack()) + return !TransformFunctionTypeParam(OldParm, /*indexAdjustment=*/0, + /*NumExpansions=*/std::nullopt, + /*ExpectParameterPack=*/false); + + SmallVector Unexpanded; + + // Find the parameter packs that could be expanded. + TypeLoc TL = OldParm->getTypeSourceInfo()->getTypeLoc(); + PackExpansionTypeLoc ExpansionTL = TL.castAs(); + TypeLoc Pattern = ExpansionTL.getPatternLoc(); + SemaRef.collectUnexpandedParameterPacks(Pattern, Unexpanded); + assert(!Unexpanded.empty() && "Pack expansion without parameter packs?"); + + bool ShouldExpand = false; + bool RetainExpansion = false; + std::optional OrigNumExpansions = + ExpansionTL.getTypePtr()->getNumExpansions(); + std::optional NumExpansions = OrigNumExpansions; + if (TryExpandParameterPacks(ExpansionTL.getEllipsisLoc(), + Pattern.getSourceRange(), Unexpanded, + ShouldExpand, RetainExpansion, NumExpansions)) + return true; + + assert(ShouldExpand && !RetainExpansion && + "Shouldn't preserve pack expansion when evaluating constraints"); + ExpandingFunctionParameterPack(OldParm); + for (unsigned I = 0; I != *NumExpansions; ++I) { + Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), I); + if (!TransformFunctionTypeParam(OldParm, /*indexAdjustment=*/0, + /*NumExpansions=*/OrigNumExpansions, + /*ExpectParameterPack=*/false)) + return true; + } + return false; +} + Decl *TemplateInstantiator::TransformDefinition(SourceLocation Loc, Decl *D) { Decl *Inst = getSema().SubstDecl(D, getSema().CurContext, TemplateArgs); if (!Inst) @@ -4591,9 +4677,8 @@ static const Decl *getCanonicalParmVarDecl(const Decl *D) { return D; } - llvm::PointerUnion * -LocalInstantiationScope::findInstantiationOf(const Decl *D) { +LocalInstantiationScope::findInstantiationUnsafe(const Decl *D) { D = getCanonicalParmVarDecl(D); for (LocalInstantiationScope *Current = this; Current; Current = Current->Outer) { @@ -4618,6 +4703,14 @@ 
LocalInstantiationScope::findInstantiationOf(const Decl *D) { break; } + return nullptr; +} + +llvm::PointerUnion * +LocalInstantiationScope::findInstantiationOf(const Decl *D) { + auto *Result = findInstantiationUnsafe(D); + if (Result) + return Result; // If we're performing a partial substitution during template argument // deduction, we may not have values for template parameters yet. if (isa(D) || isa(D) || diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 5d43d98ce49e4..0121be81bc6ac 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -713,7 +713,7 @@ class TreeTransform { /// variables vector are acceptable. /// /// LastParamTransformed, if non-null, will be set to the index of the last - /// parameter on which transfromation was started. In the event of an error, + /// parameter on which transformation was started. In the event of an error, /// this will contain the parameter which failed to instantiate. /// /// Return true on error. 
diff --git a/clang/test/CXX/drs/cwg23xx.cpp b/clang/test/CXX/drs/cwg23xx.cpp index 7f57d237526bc..f4992c1426844 100644 --- a/clang/test/CXX/drs/cwg23xx.cpp +++ b/clang/test/CXX/drs/cwg23xx.cpp @@ -528,3 +528,35 @@ namespace cwg2397 { // cwg2397: 17 } // namespace cwg2397 #endif + +#if __cplusplus >= 202002L + +namespace cwg2369 { // cwg2369: partial + +template struct Z { + typedef typename T::x xx; +}; + +template +concept C = requires { typename T::A; }; +template typename Z::xx f(void *, T); // #1 +template void f(int, T); // #2 + +struct A { +} a; + +struct ZZ { + template ::xx> operator T *(); + operator int(); +}; + +void foo() { + ZZ zz; + f(1, a); // OK, deduction fails for #1 because there is no conversion from int + // to void* + f(zz, 42); // OK, deduction fails for #1 because C is not satisfied +} + +} // namespace cwg2369 + +#endif diff --git a/clang/test/CXX/drs/cwg26xx.cpp b/clang/test/CXX/drs/cwg26xx.cpp index 63a954c803b77..35814ca9a26a3 100644 --- a/clang/test/CXX/drs/cwg26xx.cpp +++ b/clang/test/CXX/drs/cwg26xx.cpp @@ -319,7 +319,7 @@ void f(T) requires requires { []() { T::invalid; } (); }; // since-cxx20-note@-3 {{in instantiation of requirement here}} // since-cxx20-note@-4 {{while substituting template arguments into constraint expression here}} // since-cxx20-note@#cwg2672-f-0 {{while checking constraint satisfaction for template 'f' required here}} -// since-cxx20-note@#cwg2672-f-0 {{in instantiation of function template specialization 'cwg2672::f' requested here}} +// since-cxx20-note@#cwg2672-f-0 {{while substituting deduced template arguments into function template 'f' [with T = int]}} void f(...); template diff --git a/clang/test/CXX/drs/cwg27xx.cpp b/clang/test/CXX/drs/cwg27xx.cpp index fb5c8b1d1fbf8..b0adf5690acc6 100644 --- a/clang/test/CXX/drs/cwg27xx.cpp +++ b/clang/test/CXX/drs/cwg27xx.cpp @@ -174,6 +174,26 @@ static_assert(!__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion3) #endif } // namespace cwg2759 +#if 
__cplusplus >= 202002L +namespace cwg2770 { // cwg2770: 20 +template +struct B { + static_assert(sizeof(T) == 1); + using type = int; +}; + +template +int f(T t, typename B::type u) requires (sizeof(t) == 1); + +template +int f(T t, long); + +int i = f(1, 2); +int j = f('a', 2); + +} // namespace cwg2770 +#endif + namespace cwg2789 { // cwg2789: 18 #if __cplusplus >= 202302L template diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp index 763d983d20f61..a23f7dc595171 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp @@ -154,7 +154,7 @@ void func() { bar(); // expected-note@-1 {{while checking constraint satisfaction for template 'bar' required here}} \ - // expected-note@-1 {{in instantiation of function template specialization}} + // expected-note@-1 {{while substituting deduced template arguments into function template 'bar' [with T = int]}} // expected-note@#bar {{in instantiation of static data member}} // expected-note@#bar {{in instantiation of requirement here}} // expected-note@#bar {{while checking the satisfaction of nested requirement requested here}} diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp index ba8e2dc372e98..c41de77986bca 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp @@ -11,7 +11,7 @@ template struct S { // expected-error@+3{{atomic constraint must be of type 'bool' (found 'S')}} // expected-note@#FINST{{while checking constraint satisfaction}} -// expected-note@#FINST{{in instantiation of function template specialization}} +// 
expected-note@#FINST{{while substituting deduced template arguments into function template 'f' [with T = int]}} template requires (S{}) void f(T); void f(int); @@ -19,7 +19,7 @@ void f(int); // Ensure this applies to operator && as well. // expected-error@+3{{atomic constraint must be of type 'bool' (found 'S')}} // expected-note@#F2INST{{while checking constraint satisfaction}} -// expected-note@#F2INST{{in instantiation of function template specialization}} +// expected-note@#F2INST{{while substituting deduced template arguments into function template 'f2' [with T = int]}} template requires (S{} && true) void f2(T); void f2(int); @@ -32,7 +32,7 @@ template requires requires { // expected-note@-4{{while checking the satisfaction}} // expected-note@-6{{while substituting template arguments}} // expected-note@#F3INST{{while checking constraint satisfaction}} - // expected-note@#F3INST{{in instantiation of function template specialization}} + // expected-note@#F3INST{{while substituting deduced template arguments into function template 'f3' [with T = int]}} // } void f3(T); diff --git a/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp index 71e55c8290ee4..ccc109cbca0f1 100644 --- a/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp +++ b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp @@ -31,7 +31,7 @@ void function() { // expected-note@#3 {{checking the satisfaction of concept 'convertible_to'}} // expected-note@#2 {{substituting template arguments into constraint expression here}} // expected-note@#5 {{checking constraint satisfaction for template 'compare'}} -// expected-note@#5 {{in instantiation of function template specialization 'compare' requested here}} +// expected-note@#5 {{while substituting deduced template arguments into function template 'compare' [with IteratorL = Object *, IteratorR = Object *]}} // expected-note@#4 {{candidate template ignored: constraints not satisfied [with IteratorL = 
Object *, IteratorR = Object *]}} // We don't know exactly the substituted type for `lhs == rhs`, thus a placeholder 'expr-type' is emitted. diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp index 2d43e46b9e3d7..23c898e6379b0 100644 --- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp +++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp @@ -196,7 +196,7 @@ struct Foo { template using Bar = Foo; // expected-note {{constraints not satisfied for class template 'Foo'}} -// expected-note@-1 {{candidate template ignored: could not match}} +// expected-note@-1 {{candidate template ignored: could not match}} expected-note@-1 {{candidate template ignored: constraints not satisfied}} // expected-note@-2 {{implicit deduction guide declared as 'template requires __is_deducible(test14::Bar, Foo) Bar(Foo) -> Foo'}} // expected-note@-3 {{implicit deduction guide declared as 'template requires __is_deducible(test14::Bar, Foo) Bar(const double (&)[K]) -> Foo'}} double abc[3]; diff --git a/clang/test/SemaCXX/cxx23-assume.cpp b/clang/test/SemaCXX/cxx23-assume.cpp index 7f80cdfe7d452..726cb3bff652e 100644 --- a/clang/test/SemaCXX/cxx23-assume.cpp +++ b/clang/test/SemaCXX/cxx23-assume.cpp @@ -129,12 +129,12 @@ constexpr int f5() requires (!C) { return 2; } // expected-note 4 {{while che static_assert(f5() == 1); static_assert(f5() == 1); // expected-note 3 {{while checking constraint satisfaction}} - // expected-note@-1 3 {{in instantiation of}} + // expected-note@-1 3 {{while substituting deduced template arguments}} // expected-error@-2 {{no matching function for call}} static_assert(f5() == 2); -static_assert(f5() == 1); // expected-note {{while checking constraint satisfaction}} expected-note {{in instantiation of}} -static_assert(f5() == 2); // expected-note {{while checking constraint satisfaction}} expected-note {{in instantiation of}} +static_assert(f5() == 1); // expected-note {{while checking constraint satisfaction}} 
expected-note {{while substituting deduced template arguments}} +static_assert(f5() == 2); // expected-note {{while checking constraint satisfaction}} expected-note {{while substituting deduced template arguments}} // Do not validate assumptions whose evaluation would have side-effects. constexpr int foo() { diff --git a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp index 48061439941f2..4220486d3aed3 100644 --- a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp +++ b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp @@ -233,7 +233,7 @@ void g() { A *ap; f(ap, ap); // expected-error{{no matching function for call to 'f'}} \ // expected-note {{while checking constraint satisfaction}} \ - // expected-note {{in instantiation of function template specialization}} + // expected-note {{while substituting deduced template arguments}} } } diff --git a/clang/test/SemaCXX/lambda-unevaluated.cpp b/clang/test/SemaCXX/lambda-unevaluated.cpp index a9bcab58464e2..d3f937281f201 100644 --- a/clang/test/SemaCXX/lambda-unevaluated.cpp +++ b/clang/test/SemaCXX/lambda-unevaluated.cpp @@ -174,7 +174,7 @@ int* func(T) requires requires { []() { T::foo(); }; }; // expected-error{{type double* func(...); static_assert(__is_same(decltype(func(0)), double*)); // expected-note {{while checking constraint satisfaction for template 'func' required here}} - // expected-note@-1 {{in instantiation of function template specialization 'lambda_in_constraints::func'}} + // expected-note@-1 {{while substituting deduced template arguments into function template 'func' [with T = int]}} static_assert(__is_same(decltype(func(WithFoo())), int*)); template @@ -252,7 +252,7 @@ S s("a"); // #use // expected-note@#S-requires {{substituting template arguments into constraint expression here}} // expected-note@#S-requires {{in instantiation of requirement here}} // expected-note@#use {{checking constraint satisfaction for template 'S' required here}} -// expected-note@#use {{requested here}} +// 
expected-note@#use {{while substituting deduced template arguments into function template 'S' [with value:auto = const char *]}} // expected-note-re@#S 2{{candidate constructor {{.*}} not viable}} // expected-note@#S-ctor {{constraints not satisfied}} // expected-note-re@#S-requires {{because {{.*}} would be invalid}} diff --git a/clang/test/SemaTemplate/concepts-recursive-inst.cpp b/clang/test/SemaTemplate/concepts-recursive-inst.cpp index 9330df8cdd039..30a410cef91ee 100644 --- a/clang/test/SemaTemplate/concepts-recursive-inst.cpp +++ b/clang/test/SemaTemplate/concepts-recursive-inst.cpp @@ -76,7 +76,7 @@ auto it = begin(rng); // #BEGIN_CALL // expected-note@#INF_BEGIN {{while checking the satisfaction of concept 'Inf' requested here}} // expected-note@#INF_BEGIN {{while substituting template arguments into constraint expression here}} // expected-note@#BEGIN_CALL {{while checking constraint satisfaction for template 'begin' required here}} -// expected-note@#BEGIN_CALL {{in instantiation of function template specialization}} +// expected-note@#BEGIN_CALL {{while substituting deduced template arguments into function template}} // Fallout of the failure is failed lookup, which is necessary to stop odd // cascading errors. 
@@ -103,7 +103,7 @@ namespace GH50891 { // expected-note@#OP_TO {{while checking the satisfaction of concept 'Numeric' requested here}} // expected-note@#OP_TO {{while substituting template arguments into constraint expression here}} // expected-note@#FOO_CALL {{while checking constraint satisfaction for template}} - // expected-note@#FOO_CALL {{in instantiation of function template specialization}} + // expected-note@#FOO_CALL {{while substituting deduced template arguments into function template}} // expected-note@#FOO_CALL {{in instantiation of requirement here}} // expected-note@#NUMERIC {{while substituting template arguments into constraint expression here}} diff --git a/clang/test/SemaTemplate/cxx2a-constraint-exprs.cpp b/clang/test/SemaTemplate/cxx2a-constraint-exprs.cpp index f4403587a6259..5809ef684bbf3 100644 --- a/clang/test/SemaTemplate/cxx2a-constraint-exprs.cpp +++ b/clang/test/SemaTemplate/cxx2a-constraint-exprs.cpp @@ -34,7 +34,7 @@ namespace constant_evaluated { expected-note@-1{{candidate template ignored}} int a = (foo(), 0); // expected-note@-1 {{while checking}} expected-error@-1{{no matching function}} \ - expected-note@-1 {{in instantiation}} + expected-note@-1 {{while substituting}} template void bar() requires requires { requires f; } { }; // expected-note@-1{{in instantiation}} \ expected-note@-1{{while substituting}} \ diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp index d03c783313dd7..67d00bb49f77d 100644 --- a/clang/test/SemaTemplate/deduction-guide.cpp +++ b/clang/test/SemaTemplate/deduction-guide.cpp @@ -234,11 +234,6 @@ F s(0); // CHECK: | `-CXXBoolLiteralExpr {{.*}} 'bool' false // CHECK: |-CXXDeductionGuideDecl {{.*}} implicit 'auto (U) -> F<>' // CHECK: | `-ParmVarDecl {{.*}} 'U' -// CHECK: `-CXXDeductionGuideDecl {{.*}} implicit 'auto (int) -> F<>' -// CHECK: |-TemplateArgument integral ''x'' -// CHECK: |-TemplateArgument type 'int' -// CHECK: | `-BuiltinType {{.*}} 'int' 
-// CHECK: `-ParmVarDecl {{.*}} 'int' // CHECK: FunctionProtoType {{.*}} 'auto (U) -> F<>' dependent trailing_return cdecl // CHECK: |-InjectedClassNameType {{.*}} 'F<>' dependent // CHECK: | `-CXXRecord {{.*}} 'F' diff --git a/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp b/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp index af3e3358f6138..5c7a90273d0e0 100644 --- a/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp +++ b/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp @@ -38,7 +38,7 @@ template concept True = true; template -concept False = false; +concept False = false; // #False template struct concepts { template struct B { @@ -68,7 +68,7 @@ template struct nested_init_list { Y y; }; - template + template // #INIT_LIST_INNER_INVALID_HEADER struct concept_fail { // #INIT_LIST_INNER_INVALID X x; F f; @@ -81,7 +81,9 @@ using NIL = nested_init_list::B; // expected-error@+1 {{no viable constructor or deduction guide for deduction of template arguments of 'nested_init_list::concept_fail'}} nested_init_list::concept_fail nil_invalid{1, ""}; -// expected-note@#INIT_LIST_INNER_INVALID {{candidate template ignored: substitution failure [with F = const char *]: constraints not satisfied for class template 'concept_fail' [with F = const char *]}} +// expected-note@#INIT_LIST_INNER_INVALID {{candidate template ignored: constraints not satisfied [with F = const char *]}} +// expected-note@#INIT_LIST_INNER_INVALID_HEADER {{because 'const char *' does not satisfy 'False'}} +// expected-note@#False {{because 'false' evaluated to false}} // expected-note@#INIT_LIST_INNER_INVALID {{implicit deduction guide declared as 'template concept_fail(int, F) -> concept_fail'}} // expected-note@#INIT_LIST_INNER_INVALID {{candidate function template not viable: requires 1 argument, but 2 were provided}} // expected-note@#INIT_LIST_INNER_INVALID {{implicit deduction guide declared as 'template concept_fail(concept_fail) -> concept_fail'}} 
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index c069e155fd547..6f42d10e4c187 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -14045,7 +14045,7 @@

C++ defect report implementation status

2369 CD6 Ordering between constraints and substitution - Unknown + Partial 2370 @@ -16464,7 +16464,7 @@

C++ defect report implementation status

2770 open Trailing requires-clause can refer to function parameters before they are substituted into - Not resolved + Clang 20 2771 From 3092ebcd1e2746ee8db22bf3746b2ad2fb7534dd Mon Sep 17 00:00:00 2001 From: Haohai Wen Date: Sun, 5 Jan 2025 11:04:05 +0800 Subject: [PATCH 180/480] [LLD] Add CHECK-NEXT for cgprofile-obj.s tests (#121677) --- lld/test/COFF/cgprofile-obj.s | 31 +++++++++++++++++-------------- lld/test/ELF/cgprofile-obj.s | 31 +++++++++++++++++-------------- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/lld/test/COFF/cgprofile-obj.s b/lld/test/COFF/cgprofile-obj.s index 756279a8b5759..c16aa2e198dfb 100644 --- a/lld/test/COFF/cgprofile-obj.s +++ b/lld/test/COFF/cgprofile-obj.s @@ -36,17 +36,20 @@ Aa: .cg_profile B, C, 30 .cg_profile C, D, 90 -# CG-OBJ: 140001000 T A -# CG-OBJ: 140001001 T B -# CG-OBJ: 140001002 T C -# CG-OBJ: 140001003 t D - -# NO-CG: 140001000 t D -# NO-CG: 140001001 T C -# NO-CG: 140001002 T B -# NO-CG: 140001003 T A - -# CG-OBJ-OF: 140001000 t D -# CG-OBJ-OF: 140001001 T A -# CG-OBJ-OF: 140001004 T C -# CG-OBJ-OF: 140001005 T B \ No newline at end of file +# CG-OBJ: 140001000 T A +# CG-OBJ-NEXT: 140001000 t Aa +# CG-OBJ-NEXT: 140001001 T B +# CG-OBJ-NEXT: 140001002 T C +# CG-OBJ-NEXT: 140001003 t D + +# NO-CG: 140001000 t D +# NO-CG-NEXT: 140001001 T C +# NO-CG-NEXT: 140001002 T B +# NO-CG-NEXT: 140001003 T A +# NO-CG-NEXT: 140001003 t Aa + +# CG-OBJ-OF: 140001000 t D +# CG-OBJ-OF-NEXT: 140001001 T A +# CG-OBJ-OF-NEXT: 140001001 t Aa +# CG-OBJ-OF-NEXT: 140001004 T C +# CG-OBJ-OF-NEXT: 140001005 T B diff --git a/lld/test/ELF/cgprofile-obj.s b/lld/test/ELF/cgprofile-obj.s index 14016658707af..358dcd9fadaa1 100644 --- a/lld/test/ELF/cgprofile-obj.s +++ b/lld/test/ELF/cgprofile-obj.s @@ -39,17 +39,20 @@ Aa: .cg_profile B, C, 30 .cg_profile C, D, 90 -# CG-OBJ: 0000000000201123 t D -# CG-OBJ: 0000000000201122 T C -# CG-OBJ: 0000000000201121 T B -# CG-OBJ: 0000000000201120 T A - -# NO-CG: 0000000000201120 t D -# NO-CG: 
0000000000201121 T C -# NO-CG: 0000000000201122 T B -# NO-CG: 0000000000201123 T A - -# CG-OBJ-OF: 0000000000201120 t D -# CG-OBJ-OF: 0000000000201124 T C -# CG-OBJ-OF: 0000000000201125 T B -# CG-OBJ-OF: 0000000000201121 T A +# CG-OBJ: 0000000000201123 t D +# CG-OBJ-NEXT: 0000000000201120 t Aa +# CG-OBJ-NEXT: 0000000000201122 T C +# CG-OBJ-NEXT: 0000000000201121 T B +# CG-OBJ-NEXT: 0000000000201120 T A + +# NO-CG: 0000000000201120 t D +# NO-CG-NEXT: 0000000000201123 t Aa +# NO-CG-NEXT: 0000000000201121 T C +# NO-CG-NEXT: 0000000000201122 T B +# NO-CG-NEXT: 0000000000201123 T A + +# CG-OBJ-OF: 0000000000201120 t D +# CG-OBJ-OF-NEXT: 0000000000201121 t Aa +# CG-OBJ-OF-NEXT: 0000000000201124 T C +# CG-OBJ-OF-NEXT: 0000000000201125 T B +# CG-OBJ-OF-NEXT: 0000000000201121 T A From 7700695739d078eff01aad6f4d40c933419d08bc Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Sun, 5 Jan 2025 11:41:56 +0800 Subject: [PATCH 181/480] [VPlan] Fix crash with EVL tail folding intrinsic with no corresponding VP (#121542) This fixes a crash when building SPEC CPU 2017 with EVL tail folding when widening @llvm.log10 intrinsics. @llvm.log10 and some other intrinsics don't have a corresponding VP intrinsic, so this fixes the crash by removing the assert and bailing instead. 
--- .../Transforms/Vectorize/VPlanTransforms.cpp | 7 ++- ...ize-force-tail-with-evl-call-intrinsics.ll | 56 +++++++++++++++++++ 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 8ac2bd5160c26..c5a73021ca8cb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1498,10 +1498,13 @@ static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask, auto *CastR = cast(CR); VPID = VPIntrinsic::getForOpcode(CastR->getOpcode()); } - assert(VPID != Intrinsic::not_intrinsic && "Expected VP intrinsic"); + + // Not all intrinsics have a corresponding VP intrinsic. + if (VPID == Intrinsic::not_intrinsic) + return nullptr; assert(VPIntrinsic::getMaskParamPos(VPID) && VPIntrinsic::getVectorLengthParamPos(VPID) && - "Expected VP intrinsic"); + "Expected VP intrinsic to have mask and EVL"); SmallVector Ops(CR->operands()); Ops.push_back(&AllOneMask); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll index 11cf832c8abbf..f07aaecfa8467 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll @@ -989,6 +989,62 @@ exit: ret void } +; There's no @llvm.vp.log10, so don't transform it. 
+define void @log10(ptr %a, ptr %b, i64 %N) { +; IF-EVL-LABEL: define void @log10( +; IF-EVL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[ENTRY:.*]]: +; IF-EVL-NEXT: br label %[[LOOP:.*]] +; IF-EVL: [[LOOP]]: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP]], align 4 +; IF-EVL-NEXT: [[COND:%.*]] = tail call float @llvm.log10.f32(float [[TMP0]]) +; IF-EVL-NEXT: [[GEP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: store float [[COND]], ptr [[GEP9]], align 4 +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; IF-EVL: [[EXIT]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @log10( +; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[ENTRY:.*]]: +; NO-VP-NEXT: br label %[[LOOP:.*]] +; NO-VP: [[LOOP]]: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ] +; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP]], align 4 +; NO-VP-NEXT: [[COND:%.*]] = tail call float @llvm.log10.f32(float [[TMP0]]) +; NO-VP-NEXT: [[GEP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: store float [[COND]], ptr [[GEP9]], align 4 +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; NO-VP: [[EXIT]]: +; NO-VP-NEXT: ret void +; + +entry: + br label %loop + +loop: + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + %gep = getelementptr inbounds float, ptr %b, i64 %iv + %0 = load 
float, ptr %gep, align 4 + %cond = tail call float @llvm.log10.f32(float %0) + %gep9 = getelementptr inbounds float, ptr %a, i64 %iv + store float %cond, ptr %gep9, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + + declare i32 @llvm.smax.i32(i32, i32) declare i32 @llvm.smin.i32(i32, i32) declare i32 @llvm.umax.i32(i32, i32) From 743aee4951d452c7795e4e829a6cbf704340cd1c Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Sun, 5 Jan 2025 13:05:21 +0900 Subject: [PATCH 182/480] [bazel] Fixup for #121043 --- .../llvm-project-overlay/clang/BUILD.bazel | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index e2babada50051..cb4f55a5ca924 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -366,6 +366,22 @@ gentbl( td_file = "include/clang/Basic/BuiltinsX86.td", td_srcs = [ "include/clang/Basic/BuiltinsX86.td", + "include/clang/Basic/BuiltinsX86Base.td", + "include/clang/Basic/BuiltinsBase.td", + ], +) + +gentbl( + name = "basic_builtins_x86_64_gen", + tbl_outs = [( + "-gen-clang-builtins", + "include/clang/Basic/BuiltinsX86_64.inc", + )], + tblgen = ":clang-tblgen", + td_file = "include/clang/Basic/BuiltinsX86_64.td", + td_srcs = [ + "include/clang/Basic/BuiltinsX86_64.td", + "include/clang/Basic/BuiltinsX86Base.td", "include/clang/Basic/BuiltinsBase.td", ], ) @@ -708,6 +724,7 @@ cc_library( ":basic_builtins_gen", ":basic_builtins_riscv_gen", ":basic_builtins_x86_gen", + ":basic_builtins_x86_64_gen", ":basic_internal_headers", ":basic_riscv_sifive_vector_builtins_gen", ":basic_riscv_vector_builtin_cg_gen", From 00934505d44e28f3c1d5739d6369648e87f10cf5 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 4 Jan 2025 20:51:00 -0800 Subject: [PATCH 183/480] [clang-format] Add 
`TT_CompoundRequirementLBrace` for better annotation (#121539) Also, add `ST_CompoundRequirement` to help annotating */&/&& in compound requirement expressions as `TT_BinaryOperator`. Fixes #121471. --- clang/lib/Format/FormatToken.h | 1 + clang/lib/Format/TokenAnnotator.cpp | 7 ++++++- clang/lib/Format/TokenAnnotator.h | 2 ++ clang/lib/Format/UnwrappedLineFormatter.cpp | 2 +- clang/lib/Format/UnwrappedLineParser.cpp | 5 +++-- clang/unittests/Format/TokenAnnotatorTest.cpp | 12 ++++++++++++ 6 files changed, 25 insertions(+), 4 deletions(-) diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index 0fd3a49c71f9d..d97b6522f1fef 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -45,6 +45,7 @@ namespace format { TYPE(CastRParen) \ TYPE(ClassLBrace) \ TYPE(ClassRBrace) \ + TYPE(CompoundRequirementLBrace) \ /* ternary ?: expression */ \ TYPE(ConditionalExpr) \ /* the condition in an if statement */ \ diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index fad375733ef84..945174ca9c586 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -143,6 +143,8 @@ class AnnotatingParser { case TT_StructLBrace: case TT_UnionLBrace: return ST_Class; + case TT_CompoundRequirementLBrace: + return ST_CompoundRequirement; default: return ST_Other; } @@ -2076,7 +2078,7 @@ class AnnotatingParser { TT_RecordLBrace, TT_StructLBrace, TT_UnionLBrace, TT_RequiresClause, TT_RequiresClauseInARequiresExpression, TT_RequiresExpression, TT_RequiresExpressionLParen, TT_RequiresExpressionLBrace, - TT_BracedListLBrace)) { + TT_CompoundRequirementLBrace, TT_BracedListLBrace)) { CurrentToken->setType(TT_Unknown); } CurrentToken->Role.reset(); @@ -3100,6 +3102,9 @@ class AnnotatingParser { } } + if (!Scopes.empty() && Scopes.back() == ST_CompoundRequirement) + return TT_BinaryOperator; + return TT_PointerOrReference; } diff --git a/clang/lib/Format/TokenAnnotator.h 
b/clang/lib/Format/TokenAnnotator.h index 9117ca3f9fb7b..1a250e94d97c5 100644 --- a/clang/lib/Format/TokenAnnotator.h +++ b/clang/lib/Format/TokenAnnotator.h @@ -40,6 +40,8 @@ enum ScopeType { ST_ChildBlock, // Contained in class declaration/definition. ST_Class, + // Contained in compound requirement. + ST_CompoundRequirement, // Contained within other scope block (function, loop, if/else, etc). ST_Other, }; diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index 2fe4ebd4ff8eb..ec65fea6ec3df 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -535,7 +535,7 @@ class LineJoiner { // Try to merge records. if (TheLine->Last->is(TT_EnumLBrace)) { ShouldMerge = Style.AllowShortEnumsOnASingleLine; - } else if (TheLine->Last->is(TT_RequiresExpressionLBrace)) { + } else if (TheLine->Last->is(TT_CompoundRequirementLBrace)) { ShouldMerge = Style.AllowShortCompoundRequirementOnASingleLine; } else if (TheLine->Last->isOneOf(TT_ClassLBrace, TT_StructLBrace)) { // NOTE: We use AfterClass (whereas AfterStruct exists) for both classes diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 39aa37af480c9..317717241c17c 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -392,7 +392,7 @@ bool UnwrappedLineParser::parseLevel(const FormatToken *OpeningBrace, break; case tok::l_brace: if (InRequiresExpression) { - FormatTok->setFinalizedType(TT_RequiresExpressionLBrace); + FormatTok->setFinalizedType(TT_CompoundRequirementLBrace); } else if (FormatTok->Previous && FormatTok->Previous->ClosesRequiresClause) { // We need the 'default' case here to correctly parse a function @@ -1705,7 +1705,8 @@ void UnwrappedLineParser::parseStructuralElement( } for (const bool InRequiresExpression = - OpeningBrace && OpeningBrace->is(TT_RequiresExpressionLBrace); + OpeningBrace && 
OpeningBrace->isOneOf(TT_RequiresExpressionLBrace, + TT_CompoundRequirementLBrace); !eof();) { if (IsCpp && FormatTok->isCppAlternativeOperatorKeyword()) { if (auto *Next = Tokens->peekNextToken(/*SkipComment=*/true); diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 875feff3d5420..a5b2d09a9f704 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -1456,6 +1456,18 @@ TEST_F(TokenAnnotatorTest, UnderstandsRequiresExpressions) { EXPECT_TOKEN(Tokens[13], tok::l_brace, TT_RequiresExpressionLBrace); } +TEST_F(TokenAnnotatorTest, CompoundRequirement) { + auto Tokens = annotate("template \n" + "concept CheckMultiplicableBy = requires(T a, V b) {\n" + " { a * b } -> std::same_as;\n" + "};"); + ASSERT_EQ(Tokens.size(), 36u) << Tokens; + + EXPECT_TOKEN(Tokens[19], tok::l_brace, TT_RequiresExpressionLBrace); + EXPECT_TOKEN(Tokens[20], tok::l_brace, TT_CompoundRequirementLBrace); + EXPECT_TOKEN(Tokens[22], tok::star, TT_BinaryOperator); +} + TEST_F(TokenAnnotatorTest, UnderstandsPragmaRegion) { // Everything after #pragma region should be ImplicitStringLiteral auto Tokens = annotate("#pragma region Foo(Bar: Hello)"); From 267ab1cf574bd51d67f3f88c1c4f28a44fca2c8a Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 5 Jan 2025 10:29:31 +0400 Subject: [PATCH 184/480] [clang] Add test for CWG190 "Layout-compatible POD-struct types" (#121668) This patch adds test for [CWG190](https://cplusplus.github.io/CWG/issues/190.html), leveraging `__is_layout_compatible` intrinsic added for C++20 `std::is_layout_compatible`. I conservatively set the status to Clang 19, as it's the first release that implemented said intrinsic. 
--- clang/test/CXX/drs/cwg1xx.cpp | 21 +++++++++++++++++++-- clang/www/cxx_dr_status.html | 2 +- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/clang/test/CXX/drs/cwg1xx.cpp b/clang/test/CXX/drs/cwg1xx.cpp index 6aec8b65c91f1..939de6dee06d3 100644 --- a/clang/test/CXX/drs/cwg1xx.cpp +++ b/clang/test/CXX/drs/cwg1xx.cpp @@ -1314,8 +1314,25 @@ namespace cwg188 { // cwg188: yes static_assert(sizeof(0, c) == 10, ""); } -// cwg190 FIXME: add codegen test for tbaa -// or implement C++20 std::is_layout_compatible and test it this way +namespace cwg190 { // cwg190: 19 +struct A { + int a; + static double x; + int b; + void y(); + int c; +}; + +struct B { + int a; + void y(); + int b; + static double x; + int c; +}; + +static_assert(__is_layout_compatible(A, B), ""); +} // namespace cwg190 int cwg191_j; namespace cwg191 { // cwg191: yes diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 6f42d10e4c187..335442c6e605f 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -1185,7 +1185,7 @@

C++ defect report implementation status

190 TC1 Layout-compatible POD-struct types - Unknown + Clang 19 191 From 6f69f8c9fe57a44939fd111f52b6d8e267a46602 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 4 Jan 2025 22:33:02 -0800 Subject: [PATCH 185/480] [IR] Use Instruction::isBinaryOp to simplify code. NFC --- llvm/lib/IR/ConstantsContext.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h index aaaab0b7a918a..08bf3f9dff5e6 100644 --- a/llvm/lib/IR/ConstantsContext.h +++ b/llvm/lib/IR/ConstantsContext.h @@ -491,8 +491,7 @@ struct ConstantExprKeyType { default: if (Instruction::isCast(Opcode)) return new CastConstantExpr(Opcode, Ops[0], Ty); - if ((Opcode >= Instruction::BinaryOpsBegin && - Opcode < Instruction::BinaryOpsEnd)) + if (Instruction::isBinaryOp(Opcode)) return new BinaryConstantExpr(Opcode, Ops[0], Ops[1], SubclassOptionalData); llvm_unreachable("Invalid ConstantExpr!"); From c983ae8f26334279cfe7846bdc9b395abe415e65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Poussineau?= Date: Sun, 5 Jan 2025 08:18:05 +0100 Subject: [PATCH 186/480] [llvm-lib] Handle MIPS architecture (#121254) - add a test to check values for /machine argument - add a test to check if machine is correctly inferred from inputs --- llvm/lib/Object/WindowsMachineFlag.cpp | 2 ++ llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp | 3 +++ llvm/test/tools/llvm-lib/Inputs/mips.ll | 7 +++++++ llvm/test/tools/llvm-lib/infer-machine.test | 21 +++++++++++++++++++++ llvm/test/tools/llvm-lib/machine-opt.test | 13 +++++++++++++ 5 files changed, 46 insertions(+) create mode 100644 llvm/test/tools/llvm-lib/Inputs/mips.ll create mode 100644 llvm/test/tools/llvm-lib/infer-machine.test create mode 100644 llvm/test/tools/llvm-lib/machine-opt.test diff --git a/llvm/lib/Object/WindowsMachineFlag.cpp b/llvm/lib/Object/WindowsMachineFlag.cpp index b9f818775768a..caf357e8c136f 100644 --- a/llvm/lib/Object/WindowsMachineFlag.cpp +++ 
b/llvm/lib/Object/WindowsMachineFlag.cpp @@ -21,6 +21,7 @@ using namespace llvm; // Returns /machine's value. COFF::MachineTypes llvm::getMachineType(StringRef S) { + // Flags must be a superset of Microsoft lib.exe /machine flags. return StringSwitch(S.lower()) .Cases("x64", "amd64", COFF::IMAGE_FILE_MACHINE_AMD64) .Cases("x86", "i386", COFF::IMAGE_FILE_MACHINE_I386) @@ -28,6 +29,7 @@ COFF::MachineTypes llvm::getMachineType(StringRef S) { .Case("arm64", COFF::IMAGE_FILE_MACHINE_ARM64) .Case("arm64ec", COFF::IMAGE_FILE_MACHINE_ARM64EC) .Case("arm64x", COFF::IMAGE_FILE_MACHINE_ARM64X) + .Case("mips", COFF::IMAGE_FILE_MACHINE_R4000) .Default(COFF::IMAGE_FILE_MACHINE_UNKNOWN); } diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp index 138d9fc7f1d7f..6ce06b434b2c0 100644 --- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp +++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -171,6 +171,7 @@ static Expected getCOFFFileMachine(MemoryBufferRef MB) { uint16_t Machine = (*Obj)->getMachine(); if (Machine != COFF::IMAGE_FILE_MACHINE_I386 && Machine != COFF::IMAGE_FILE_MACHINE_AMD64 && + Machine != COFF::IMAGE_FILE_MACHINE_R4000 && Machine != COFF::IMAGE_FILE_MACHINE_ARMNT && !COFF::isAnyArm64(Machine)) { return createStringError(inconvertibleErrorCode(), "unknown machine: " + std::to_string(Machine)); @@ -195,6 +196,8 @@ static Expected getBitcodeFileMachine(MemoryBufferRef MB) { case Triple::aarch64: return T.isWindowsArm64EC() ? 
COFF::IMAGE_FILE_MACHINE_ARM64EC : COFF::IMAGE_FILE_MACHINE_ARM64; + case Triple::mipsel: + return COFF::IMAGE_FILE_MACHINE_R4000; default: return createStringError(inconvertibleErrorCode(), "unknown arch in target triple: " + *TripleStr); diff --git a/llvm/test/tools/llvm-lib/Inputs/mips.ll b/llvm/test/tools/llvm-lib/Inputs/mips.ll new file mode 100644 index 0000000000000..dd0f8338cfa97 --- /dev/null +++ b/llvm/test/tools/llvm-lib/Inputs/mips.ll @@ -0,0 +1,7 @@ +target triple = "mipsel-windows-coff" + +; Function Attrs: noinline nounwind optnone +define dso_local void @"?f@@YAXXZ"() #0 { +entry: + ret void +} diff --git a/llvm/test/tools/llvm-lib/infer-machine.test b/llvm/test/tools/llvm-lib/infer-machine.test new file mode 100644 index 0000000000000..23ecf256069e6 --- /dev/null +++ b/llvm/test/tools/llvm-lib/infer-machine.test @@ -0,0 +1,21 @@ +REQUIRES: mips-registered-target + +RUN: rm -rf %t && mkdir -p %t + +RUN: llc -mtriple=i386-windows-coff -filetype=obj -o %t/i386.obj %S/Inputs/i386.ll +RUN: llvm-as %S/Inputs/i386.ll -o %t/i386.bc +RUN: llvm-lib %t/i386.obj %t/i386.bc /out:%t/i386.lib +RUN: llvm-objdump -h %t/i386.lib | FileCheck %s --check-prefix=I386 +I386: file format coff-i386 + +RUN: llc -mtriple=x86_64-windows-coff -filetype=obj -o %t/x86_64.obj %S/Inputs/x86_64.ll +RUN: llvm-as %S/Inputs/x86_64.ll -o %t/x86_64.bc +RUN: llvm-lib %t/x86_64.obj %t/x86_64.bc /out:%t/x86_64.lib +RUN: llvm-objdump -h %t/x86_64.lib | FileCheck %s --check-prefix=X86_64 +X86_64: file format coff-x86-64 + +RUN: llc -mtriple=mipsel-windows-coff -filetype=obj -o %t/mips.obj %S/Inputs/mips.ll +RUN: llvm-as %S/Inputs/mips.ll -o %t/mips.bc +RUN: llvm-lib %t/mips.obj %t/mips.bc /out:%t/mips.lib +RUN: llvm-objdump -h %t/mips.lib | FileCheck %s --check-prefix=MIPS +MIPS: file format coff-mips diff --git a/llvm/test/tools/llvm-lib/machine-opt.test b/llvm/test/tools/llvm-lib/machine-opt.test new file mode 100644 index 0000000000000..e5ade82c2f0a6 --- /dev/null +++ 
b/llvm/test/tools/llvm-lib/machine-opt.test @@ -0,0 +1,13 @@ +RUN: rm -f %t.lib + +RUN: llvm-lib /out:%t.lib /machine:i386 2>&1 | FileCheck --check-prefix=EMPTYWARN %s +RUN: llvm-lib /out:%t.lib /machine:amd64 2>&1 | FileCheck --check-prefix=EMPTYWARN %s + +RUN: llvm-lib /out:%t.lib /machine:mips 2>&1 | FileCheck --check-prefix=EMPTYWARN %s + +RUN: llvm-lib /out:%t.lib /machine:arm 2>&1 | FileCheck --check-prefix=EMPTYWARN %s +RUN: llvm-lib /out:%t.lib /machine:arm64 2>&1 | FileCheck --check-prefix=EMPTYWARN %s +RUN: llvm-lib /out:%t.lib /machine:arm64x 2>&1 | FileCheck --check-prefix=EMPTYWARN %s + +EMPTYWARN: warning: no input files, not writing output file + From 8267bea9a35c3c3f866b942a50c2b98ac462ce35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Poussineau?= Date: Sun, 5 Jan 2025 08:20:16 +0100 Subject: [PATCH 187/480] [Clang][MIPS] Send correct architecture for MinGW toolchains (#121042) 'mipspe' name was chosen by binutils, when the project was able to create executables for Windows CE/MIPS. 
--- clang/lib/Driver/ToolChains/MinGW.cpp | 3 +++ clang/test/Driver/mingw.cpp | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp index 963de81027ca9..9f0c6160a309e 100644 --- a/clang/lib/Driver/ToolChains/MinGW.cpp +++ b/clang/lib/Driver/ToolChains/MinGW.cpp @@ -138,6 +138,9 @@ void tools::MinGW::Linker::ConstructJob(Compilation &C, const JobAction &JA, else CmdArgs.push_back("arm64pe"); break; + case llvm::Triple::mipsel: + CmdArgs.push_back("mipspe"); + break; default: D.Diag(diag::err_target_unknown_triple) << TC.getEffectiveTriple().str(); } diff --git a/clang/test/Driver/mingw.cpp b/clang/test/Driver/mingw.cpp index 9790c86a364f8..66da0c97f4166 100644 --- a/clang/test/Driver/mingw.cpp +++ b/clang/test/Driver/mingw.cpp @@ -85,6 +85,10 @@ // RUN: | FileCheck %s --check-prefix CHECK_MINGW_EC_LINK // CHECK_MINGW_EC_LINK: "-m" "arm64ecpe" +// RUN: %clang --target=mipsel-windows-gnu -### -o /dev/null %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix CHECK_MINGW_MIPSPE +// CHECK_MINGW_MIPSPE: "-m" "mipspe" + // RUN: %clang --target=i686-windows-gnu -fms-hotpatch -### -- %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=FUNCTIONPADMIN // FUNCTIONPADMIN: "--functionpadmin" From fbc198c548cf21bb2be29509a46913a57f95e610 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 5 Jan 2025 13:15:08 +0400 Subject: [PATCH 188/480] [clang] Add test for CWG192 "Name lookup in parameters" (#121679) This patch adds a rather simple test for [CWG192](https://cplusplus.github.io/CWG/issues/192.html). Parameter declarations of member functions are not complete-class contexts (unlike default arguments), so the example in the issue is ill-formed. Changes in [CWG1352](https://cplusplus.github.io/CWG/issues/1352.html) which resolved the issue, are superseded by the notion of complete-class context (https://eel.is/c++draft/class.mem#def:complete-class_context). 
--- clang/test/CXX/drs/cwg1xx.cpp | 8 ++++++++ clang/www/cxx_dr_status.html | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/clang/test/CXX/drs/cwg1xx.cpp b/clang/test/CXX/drs/cwg1xx.cpp index 939de6dee06d3..9eeca4cb2a681 100644 --- a/clang/test/CXX/drs/cwg1xx.cpp +++ b/clang/test/CXX/drs/cwg1xx.cpp @@ -1364,6 +1364,14 @@ namespace cwg191 { // cwg191: yes } } +namespace cwg192 { // cwg192: 2.7 +struct S { + void f(I i) { } + // expected-error@-1 {{unknown type name 'I'}} + typedef int I; +}; +} // namespace cwg192 + // cwg193 is in cwg193.cpp namespace cwg194 { // cwg194: yes diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 335442c6e605f..239c05e782384 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -1197,7 +1197,7 @@

C++ defect report implementation status

192 NAD Name lookup in parameters - Unknown + Clang 2.7 193 From 3321c2d72ab7757dbdd38bdd99a76d89293dac8a Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 5 Jan 2025 13:17:15 +0400 Subject: [PATCH 189/480] [clang] Add test for CWG794 "Base-derived conversion in member type of pointer-to-member conversion" (#121660) This patch adds a test for [CWG794](https://cplusplus.github.io/CWG/issues/794.html), which is an NB comment closed as NAD. Author was asked to bring a paper to Evolution, which never happened. So we test for the absence of base-derived conversion in pointer-to-member conversion. --- clang/test/CXX/drs/cwg7xx.cpp | 12 ++++++++++++ clang/www/cxx_dr_status.html | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/clang/test/CXX/drs/cwg7xx.cpp b/clang/test/CXX/drs/cwg7xx.cpp index 507eb8fb71435..842d172d37900 100644 --- a/clang/test/CXX/drs/cwg7xx.cpp +++ b/clang/test/CXX/drs/cwg7xx.cpp @@ -337,3 +337,15 @@ template void h(int i = 0, T ...args, int j = 1) {} #endif } + +namespace cwg794 { // cwg794: 2.7 +struct B {}; +struct D : B {}; +struct X { + D d; +}; +struct Y : X {}; +B Y::*pm = &X::d; +// expected-error@-1 {{cannot initialize a variable of type 'B Y::*' with an rvalue of type 'D cwg794::X::*': different classes ('Y' vs 'cwg794::X')}} +// FIXME: why diagnostic says just `Y` and not `cwg794::Y`, like `cwg794::X`? +} // namespace cwg794 diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 239c05e782384..d767d2973b57e 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -4775,7 +4775,7 @@

C++ defect report implementation status

794 NAD Base-derived conversion in member type of pointer-to-member conversion - Unknown + Clang 2.7 795 From ba93eccded30862969a3c5f547d837d6d102c863 Mon Sep 17 00:00:00 2001 From: Anutosh Bhat Date: Sun, 5 Jan 2025 15:19:50 +0530 Subject: [PATCH 190/480] [lld][MachO] Fix warning while building for wasm (#120889) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While building clang & lld against emscripten for wasm, I see the following ``` │ │ /home/runner/work/recipes/recipes/output/bld/rattler-build_llvm_1734801187/work/lld/MachO/SyntheticSections.cpp:2075:25: warning: comparison of integers of │ │ different signs: 'long' and 'const uint32_t' (aka 'const unsigned int') [-Wsign-compare] │ │ 2075 | assert(buf - bufStart == sectionSize && │ │ | ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~ │ │ $BUILD_PREFIX/opt/emsdk/upstream/emscripten/cache/sysroot/include/assert.h:8:28: note: expanded from macro 'assert' │ │ 8 | #define assert(x) ((void)((x) || (__assert_fail(#x, __FILE__, __LINE__, __func__),0))) │ │ | ^ ``` Casting `sectionSize` should be enough I think --- lld/MachO/SyntheticSections.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index 28fb8047cacd9..417b7cf93efa7 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -2079,12 +2079,12 @@ void ObjCMethListSection::finalize() { void ObjCMethListSection::writeTo(uint8_t *bufStart) const { uint8_t *buf = bufStart; for (const ConcatInputSection *isec : inputs) { - assert(buf - bufStart == long(isec->outSecOff) && + assert(buf - bufStart == std::ptrdiff_t(isec->outSecOff) && "Writing at unexpected offset"); uint32_t writtenSize = writeRelativeMethodList(isec, buf); buf += writtenSize; } - assert(buf - bufStart == sectionSize && + assert(buf - bufStart == std::ptrdiff_t(sectionSize) && "Written size does not match expected section size"); } From 
df4a615c988f3ae56f7e68a7df86acb60f16493a Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 5 Jan 2025 11:16:00 +0000 Subject: [PATCH 191/480] [VPlan] Convert induction increment check to be VPlan-based. Check the VPlan directly to determine if a VPValue is an optimizable IV or IV use instead of checking the underlying IR instructions. Split off from https://github.com/llvm/llvm-project/pull/112147. This refactoring enables moving IV end value creation from the legacy fixupIVUsers to a VPlan-based transform. There is one case we now won't optimize, that is IVs with subtracts and non-constant steps. But as this is a minor optimization and doesn't impact correctness, the benefits of performing the check in VPlan should outweigh the missed case. --- .../Transforms/Vectorize/LoopVectorize.cpp | 84 +++++++++++++++---- llvm/lib/Transforms/Vectorize/VPlan.h | 9 ++ .../Transforms/Vectorize/VPlanPatternMatch.h | 21 ++++- .../LoopVectorize/X86/induction-step.ll | 3 +- 4 files changed, 98 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5b75f6b26b6c5..7b4a47dc02b6d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8953,14 +8953,73 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { } } +/// Return true if \p VPV is an optimizable IV or IV use. That is, if \p VPV is +/// either an untruncated wide induction, or if it increments a wide induction +/// by its step. +static bool isOptimizableIVOrUse(VPValue *VPV) { + VPRecipeBase *Def = VPV->getDefiningRecipe(); + if (!Def) + return false; + auto *WideIV = dyn_cast(Def); + if (WideIV) { + // VPV itself is a wide induction, separately compute the end value for exit + // users if it is not a truncated IV. + return isa(WideIV) || + !cast(WideIV)->getTruncInst(); + } + + // Check if VPV is an optimizable induction increment. 
+ if (Def->getNumOperands() != 2) + return false; + WideIV = dyn_cast(Def->getOperand(0)); + if (!WideIV) + WideIV = dyn_cast(Def->getOperand(1)); + if (!WideIV) + return false; + + using namespace VPlanPatternMatch; + auto &ID = WideIV->getInductionDescriptor(); + + // Check if VPV increments the induction by the induction step. + VPValue *IVStep = WideIV->getStepValue(); + switch (ID.getInductionOpcode()) { + case Instruction::Add: + return match(VPV, m_c_Binary(m_Specific(WideIV), + m_Specific(IVStep))); + case Instruction::FAdd: + return match(VPV, m_c_Binary(m_Specific(WideIV), + m_Specific(IVStep))); + case Instruction::FSub: + return match(VPV, m_Binary(m_Specific(WideIV), + m_Specific(IVStep))); + case Instruction::Sub: { + // IVStep will be the negated step of the subtraction. Check if Step == -1 * + // IVStep. + VPValue *Step; + if (!match(VPV, m_Binary(m_VPValue(), m_VPValue(Step))) || + !Step->isLiveIn() || !IVStep->isLiveIn()) + return false; + auto *StepCI = dyn_cast(Step->getLiveInIRValue()); + auto *IVStepCI = dyn_cast(IVStep->getLiveInIRValue()); + return StepCI && IVStepCI && + StepCI->getValue() == (-1 * IVStepCI->getValue()); + } + default: + return ID.getKind() == InductionDescriptor::IK_PtrInduction && + match(VPV, m_GetElementPtr(m_Specific(WideIV), + m_Specific(WideIV->getStepValue()))); + } + llvm_unreachable("should have been covered by switch above"); +} + // Collect VPIRInstructions for phis in the exit blocks that are modeled // in VPlan and add the exiting VPValue as operand. Some exiting values are not // modeled explicitly yet and won't be included. Those are un-truncated // VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction // increments. 
-static SetVector collectUsersInExitBlocks( - Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, - const MapVector &Inductions) { +static SetVector +collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder, + VPlan &Plan) { auto *MiddleVPBB = Plan.getMiddleBlock(); SetVector ExitUsersToFix; for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) { @@ -8985,18 +9044,9 @@ static SetVector collectUsersInExitBlocks( // Exit values for inductions are computed and updated outside of VPlan // and independent of induction recipes. // TODO: Compute induction exit values in VPlan. - if ((isa(V) && - !cast(V)->getTruncInst()) || - isa(V) || - (isa(IncomingValue) && - OrigLoop->contains(cast(IncomingValue)) && - any_of(IncomingValue->users(), [&Inductions](User *U) { - auto *P = dyn_cast(U); - return P && Inductions.contains(P); - }))) { - if (ExitVPBB->getSinglePredecessor() == MiddleVPBB) - continue; - } + if (isOptimizableIVOrUse(V) && + ExitVPBB->getSinglePredecessor() == MiddleVPBB) + continue; ExitUsersToFix.insert(ExitIRI); ExitIRI->addOperand(V); } @@ -9331,8 +9381,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder); } addScalarResumePhis(RecipeBuilder, *Plan); - SetVector ExitUsersToFix = collectUsersInExitBlocks( - OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); + SetVector ExitUsersToFix = + collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan); addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix); if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) { reportVectorizationFailure( diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 88f3f672d3aa3..1be57d23f19cf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2095,6 +2095,15 @@ class VPWidenInductionRecipe : public VPHeaderPHIRecipe { R->getVPDefID() == VPDef::VPWidenPointerInductionSC; } + static 
inline bool classof(const VPValue *V) { + auto *R = V->getDefiningRecipe(); + return R && classof(R); + } + + static inline bool classof(const VPHeaderPHIRecipe *R) { + return classof(static_cast(R)); + } + virtual void execute(VPTransformState &State) override = 0; /// Returns the step value of the induction. diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index ec3c203a61b38..4866426ad8848 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -139,7 +139,8 @@ struct MatchRecipeAndOpcode { if constexpr (std::is_same::value || std::is_same::value || std::is_same::value || - std::is_same::value) + std::is_same::value || + std::is_same::value) return DefR; else return DefR && DefR->getOpcode() == Opcode; @@ -309,6 +310,12 @@ m_Binary(const Op0_t &Op0, const Op1_t &Op1) { return AllBinaryRecipe_match(Op0, Op1); } +template +inline AllBinaryRecipe_match +m_c_Binary(const Op0_t &Op0, const Op1_t &Op1) { + return AllBinaryRecipe_match(Op0, Op1); +} + template inline AllBinaryRecipe_match m_Mul(const Op0_t &Op0, const Op1_t &Op1) { @@ -339,6 +346,18 @@ m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) { return m_BinaryOr(Op0, Op1); } +template +using GEPLikeRecipe_match = + BinaryRecipe_match; + +template +inline GEPLikeRecipe_match m_GetElementPtr(const Op0_t &Op0, + const Op1_t &Op1) { + return GEPLikeRecipe_match(Op0, Op1); +} + template using AllTernaryRecipe_match = Recipe_match, Opcode, false, diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll index f6a9767c7f87d..1dd2692ba6822 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll @@ -115,6 +115,7 @@ define i16 @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 
[[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP10]], i32 3 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -131,7 +132,7 @@ define i16 @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i16 [ [[SUB]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i16 [ [[SUB]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i16 [[SUB_LCSSA]] ; entry: From 64e8d5b1baaa478c40931d290bf30687a6c93dac Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 5 Jan 2025 15:49:55 +0400 Subject: [PATCH 192/480] [clang] Add tests from CWG156 to CWG1111 (dual-scope lookup for conversion-function-ids) (#121654) This patch adds test from [CWG156](https://cplusplus.github.io/CWG/issues/156.html) to [CWG1111](https://cplusplus.github.io/CWG/issues/1111.html) test, and downgrades the latter to partial availability. 
The most relevant piece of current wording is [[basic.lookup.unqual]/5](https://eel.is/c++draft/basic.lookup#unqual-5): > An unqualified name that is a component name ([[expr.prim.id.unqual]](https://eel.is/c++draft/expr.prim.id.unqual)) of a [type-specifier](https://eel.is/c++draft/dcl.type.general#nt:type-specifier) or [ptr-operator](https://eel.is/c++draft/dcl.decl.general#nt:ptr-operator) of a [conversion-type-id](https://eel.is/c++draft/class.conv.fct#nt:conversion-type-id) is looked up in the same fashion as the [conversion-function-id](https://eel.is/c++draft/class.conv.fct#nt:conversion-function-id) in which it appears[.](https://eel.is/c++draft/basic.lookup#unqual-5.sentence-1) If that lookup finds nothing, it undergoes unqualified name lookup; in each case, only names that denote types or templates whose specializations are types are considered[.](https://eel.is/c++draft/basic.lookup#unqual-5.sentence-2) Per resolution of [CWG1111](https://cplusplus.github.io/CWG/issues/1111.html), additional lookup in the context of the entire postfix-expression, which originally was intended to cross-check lookup in the context of object-expression, was effectively turned into a fallback for it. Check out "Calling a conversion function" example in [P1787R6](https://wg21.link/p1787r6) for step-by-step explanation of the current lookup mechanics for conversion functions. Clang rejects one of the well-formed examples, hence partial status. 
Clang is the only implementation which rejects it: https://godbolt.org/z/ohhbx8Mfs --- clang/test/CXX/drs/cwg11xx.cpp | 44 +++++++++++++++++++++++++++++++++- clang/test/CXX/drs/cwg1xx.cpp | 1 + clang/www/cxx_dr_status.html | 4 ++-- 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/clang/test/CXX/drs/cwg11xx.cpp b/clang/test/CXX/drs/cwg11xx.cpp index 8d187041400a6..d33ff060d2d39 100644 --- a/clang/test/CXX/drs/cwg11xx.cpp +++ b/clang/test/CXX/drs/cwg11xx.cpp @@ -19,7 +19,7 @@ decltype(return_T>())* b; #endif } // namespace cwg1110 -namespace cwg1111 { // cwg1111: 3.2 +namespace cwg1111 { // cwg1111: partial namespace example1 { template struct set; // #cwg1111-struct-set @@ -57,6 +57,48 @@ void baz() { a.operator A(); } } // namespace example2 + +namespace example3 { +struct A { + operator int(); +} a; +void foo() { + typedef int T; + a.operator T(); // T is found using unqualified lookup + // after qualified lookup in A fails. +} +} // namespace example3 + +namespace example4 { +struct A { + typedef int T; // #cwg1111-A-T + operator T(); +}; +struct B : A { + operator T(); +} b; +void foo() { + b.A::operator T(); // FIXME: qualified lookup should find T in A. + // expected-error@-1 {{unknown type name 'T'}} + // expected-note@#cwg1111-A-T {{'A::T' declared here}} +} +} // namespace example4 + +namespace example5 { +template struct A { + operator T1(); +}; +template struct B : A { + operator T2(); + void foo() { + // In both cases, during instantiation, qualified lookup for T2 wouldn't be able + // to find anything, so T2 has to be found by unqualified lookup. + // After that, 'operator T2()' is found in A by qualified lookup. 
+ T2 a = A::operator T2(); + T2 b = ((A *)this)->operator T2(); + } +}; +} // namespace example5 } // namespace cwg1111 namespace cwg1113 { // cwg1113: partial diff --git a/clang/test/CXX/drs/cwg1xx.cpp b/clang/test/CXX/drs/cwg1xx.cpp index 9eeca4cb2a681..e8fe3fff43a57 100644 --- a/clang/test/CXX/drs/cwg1xx.cpp +++ b/clang/test/CXX/drs/cwg1xx.cpp @@ -922,6 +922,7 @@ namespace cwg155 { // cwg155: dup 632 // expected-warning@-1 {{braces around scalar initializer}} } +// cwg156: sup 1111 // cwg158 is in cwg158.cpp namespace cwg159 { // cwg159: 3.5 diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index d767d2973b57e..6ed5e3d37bae7 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -981,7 +981,7 @@

C++ defect report implementation status

156 NAD Name lookup for conversion functions - Unknown + Superseded by 1111 157 @@ -6485,7 +6485,7 @@

C++ defect report implementation status

1111 C++11 Remove dual-scope lookup of member template names - Clang 3.2 + Partial 1112 From 327e2b7c7659e2fff2e644850b767ca77234bef4 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sun, 5 Jan 2025 16:13:44 +0400 Subject: [PATCH 193/480] [clang][NFC] Make all C++ DR tests run in full range of language modes (#121688) This patch plugs holes in RUN lines in C++ DR tests, making sure they are run in C++98 through C++26, with the exception of C++03, which in Clang is synonymous to C++98. --- clang/test/CXX/drs/cwg0xx.cpp | 1 + clang/test/CXX/drs/cwg10xx.cpp | 1 + clang/test/CXX/drs/cwg118.cpp | 6 ++++- clang/test/CXX/drs/cwg11xx.cpp | 4 +++- clang/test/CXX/drs/cwg12xx.cpp | 1 + clang/test/CXX/drs/cwg158.cpp | 5 ++++- clang/test/CXX/drs/cwg1748.cpp | 5 ++++- clang/test/CXX/drs/cwg177x.cpp | 5 ++++- clang/test/CXX/drs/cwg1xx.cpp | 1 + clang/test/CXX/drs/cwg2771.cpp | 22 ++++++++++++------- clang/test/CXX/drs/cwg2xx.cpp | 1 + clang/test/CXX/drs/cwg3xx.cpp | 11 +++++----- .../test/CXX/drs/{cwgr593.cpp => cwg593.cpp} | 0 clang/test/CXX/drs/cwg6xx.cpp | 1 + clang/test/CXX/drs/cwg7xx.cpp | 4 +++- clang/test/CXX/drs/cwg9xx.cpp | 1 + 16 files changed, 50 insertions(+), 19 deletions(-) rename clang/test/CXX/drs/{cwgr593.cpp => cwg593.cpp} (100%) diff --git a/clang/test/CXX/drs/cwg0xx.cpp b/clang/test/CXX/drs/cwg0xx.cpp index 993b6f2923859..8f7bd6532ae6d 100644 --- a/clang/test/CXX/drs/cwg0xx.cpp +++ b/clang/test/CXX/drs/cwg0xx.cpp @@ -4,6 +4,7 @@ // RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors -triple %itanium_abi_triple // RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors -triple %itanium_abi_triple // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors -triple %itanium_abi_triple +// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11,since-cxx17 -fexceptions 
-fcxx-exceptions -pedantic-errors -triple %itanium_abi_triple #if __cplusplus == 199711L #define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) diff --git a/clang/test/CXX/drs/cwg10xx.cpp b/clang/test/CXX/drs/cwg10xx.cpp index 58d552942c77c..01de13238a6ae 100644 --- a/clang/test/CXX/drs/cwg10xx.cpp +++ b/clang/test/CXX/drs/cwg10xx.cpp @@ -4,6 +4,7 @@ // RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors namespace std { __extension__ typedef __SIZE_TYPE__ size_t; diff --git a/clang/test/CXX/drs/cwg118.cpp b/clang/test/CXX/drs/cwg118.cpp index 04e19ce050788..c7685fdb4f5b1 100644 --- a/clang/test/CXX/drs/cwg118.cpp +++ b/clang/test/CXX/drs/cwg118.cpp @@ -1,7 +1,11 @@ // RUN: %clang_cc1 -triple x86_64-linux -std=c++98 %s -pedantic-errors -emit-llvm -o - | FileCheck %s --implicit-check-not " call " // RUN: %clang_cc1 -triple x86_64-linux -std=c++11 %s -pedantic-errors -emit-llvm -o - | FileCheck %s --implicit-check-not " call " // RUN: %clang_cc1 -triple x86_64-linux -std=c++14 %s -pedantic-errors -emit-llvm -o - | FileCheck %s --implicit-check-not " call " -// RUN: %clang_cc1 -triple x86_64-linux -std=c++1z %s -pedantic-errors -emit-llvm -o - | FileCheck %s --implicit-check-not " call " +// RUN: %clang_cc1 -triple x86_64-linux -std=c++17 %s -pedantic-errors -emit-llvm -o - | FileCheck %s --implicit-check-not " call " +// RUN: %clang_cc1 -triple x86_64-linux -std=c++20 %s -pedantic-errors -emit-llvm -o - | FileCheck %s --implicit-check-not " call " +// RUN: %clang_cc1 -triple x86_64-linux -std=c++23 %s -pedantic-errors -emit-llvm -o - | FileCheck %s --implicit-check-not " call " 
+// RUN: %clang_cc1 -triple x86_64-linux -std=c++2c %s -pedantic-errors -emit-llvm -o - | FileCheck %s --implicit-check-not " call " + // cwg118: yes diff --git a/clang/test/CXX/drs/cwg11xx.cpp b/clang/test/CXX/drs/cwg11xx.cpp index d33ff060d2d39..dc024caa5075b 100644 --- a/clang/test/CXX/drs/cwg11xx.cpp +++ b/clang/test/CXX/drs/cwg11xx.cpp @@ -2,7 +2,9 @@ // RUN: %clang_cc1 -std=c++11 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++14 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++17 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++2a %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++20 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++23 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++2c %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors namespace cwg1110 { // cwg1110: 3.1 #if __cplusplus >= 201103L diff --git a/clang/test/CXX/drs/cwg12xx.cpp b/clang/test/CXX/drs/cwg12xx.cpp index 951c71a9832de..1011afa5905e7 100644 --- a/clang/test/CXX/drs/cwg12xx.cpp +++ b/clang/test/CXX/drs/cwg12xx.cpp @@ -4,6 +4,7 @@ // RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx17,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx17,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx17,since-cxx14,since-cxx11,since-cxx23 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx17,since-cxx14,since-cxx11,since-cxx23 -fexceptions -fcxx-exceptions -pedantic-errors // cwg1200: na diff --git a/clang/test/CXX/drs/cwg158.cpp b/clang/test/CXX/drs/cwg158.cpp index 2a74438264777..1f5c319e6bd25 
100644 --- a/clang/test/CXX/drs/cwg158.cpp +++ b/clang/test/CXX/drs/cwg158.cpp @@ -1,7 +1,10 @@ // RUN: %clang_cc1 -triple x86_64-linux -std=c++98 %s -O3 -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK %s // RUN: %clang_cc1 -triple x86_64-linux -std=c++11 %s -O3 -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK %s // RUN: %clang_cc1 -triple x86_64-linux -std=c++14 %s -O3 -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK %s -// RUN: %clang_cc1 -triple x86_64-linux -std=c++1z %s -O3 -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK %s +// RUN: %clang_cc1 -triple x86_64-linux -std=c++17 %s -O3 -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK %s +// RUN: %clang_cc1 -triple x86_64-linux -std=c++20 %s -O3 -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK %s +// RUN: %clang_cc1 -triple x86_64-linux -std=c++23 %s -O3 -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK %s +// RUN: %clang_cc1 -triple x86_64-linux -std=c++2c %s -O3 -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK %s // RUN: %clang_cc1 -triple x86_64-linux -std=c++1z %s -O3 -pointer-tbaa -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK,POINTER-TBAA %s // cwg158: yes diff --git a/clang/test/CXX/drs/cwg1748.cpp b/clang/test/CXX/drs/cwg1748.cpp index f216963d69f2a..a0fe737539392 100644 --- a/clang/test/CXX/drs/cwg1748.cpp +++ b/clang/test/CXX/drs/cwg1748.cpp @@ -1,7 +1,10 @@ // RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s // RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s // RUN: %clang_cc1 
-std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s -// RUN: %clang_cc1 -std=c++1z %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s +// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s +// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s +// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s +// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s // cwg1748: 3.7 diff --git a/clang/test/CXX/drs/cwg177x.cpp b/clang/test/CXX/drs/cwg177x.cpp index cc62bdac4cf06..72a12c2f92c20 100644 --- a/clang/test/CXX/drs/cwg177x.cpp +++ b/clang/test/CXX/drs/cwg177x.cpp @@ -1,7 +1,10 @@ // RUN: %clang_cc1 -std=c++98 %s -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s // RUN: %clang_cc1 -std=c++11 %s -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s --check-prefixes=CHECK,CXX11 // RUN: %clang_cc1 -std=c++14 %s -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s --check-prefixes=CHECK,CXX11,CXX14 -// RUN: %clang_cc1 -std=c++1z %s -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s --check-prefixes=CHECK,CXX11,CXX14 +// RUN: %clang_cc1 -std=c++17 %s -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s --check-prefixes=CHECK,CXX11,CXX14 +// RUN: %clang_cc1 -std=c++20 %s -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s --check-prefixes=CHECK,CXX11,CXX14 +// RUN: %clang_cc1 -std=c++23 %s -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s --check-prefixes=CHECK,CXX11,CXX14 +// RUN: %clang_cc1 
-std=c++2c %s -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s --check-prefixes=CHECK,CXX11,CXX14 // RUN: %clang_cc1 -std=c++1z %s -fexceptions -fcxx-exceptions -pedantic-errors -triple i386-windows-pc -ast-dump | FileCheck %s --check-prefixes=CHECK,CXX11,CXX14 namespace cwg1772 { // cwg1772: 14 diff --git a/clang/test/CXX/drs/cwg1xx.cpp b/clang/test/CXX/drs/cwg1xx.cpp index e8fe3fff43a57..cc1dd784b127d 100644 --- a/clang/test/CXX/drs/cwg1xx.cpp +++ b/clang/test/CXX/drs/cwg1xx.cpp @@ -4,6 +4,7 @@ // RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17,cxx98-17 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors #if __cplusplus == 199711L #define static_assert(...) 
__extension__ _Static_assert(__VA_ARGS__) diff --git a/clang/test/CXX/drs/cwg2771.cpp b/clang/test/CXX/drs/cwg2771.cpp index 474660aa28440..2dd446ac06142 100644 --- a/clang/test/CXX/drs/cwg2771.cpp +++ b/clang/test/CXX/drs/cwg2771.cpp @@ -1,4 +1,10 @@ -// RUN: %clang_cc1 -std=c++23 %s -ast-dump | FileCheck --check-prefixes=CXX23 %s +// RUN: %clang_cc1 -std=c++98 %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++11 %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++14 %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++17 %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++20 %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++23 %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -std=c++2c %s -ast-dump | FileCheck %s namespace cwg2771 { // cwg2771: 18 @@ -8,12 +14,12 @@ struct A{ int* r = &a; } }; -// CXX23: CXXMethodDecl{{.+}}cwg2771 -// CXX23-NEXT: CompoundStmt -// CXX23-NEXT: DeclStmt -// CXX23-NEXT: VarDecl -// CXX23-NEXT: UnaryOperator -// CXX23-NEXT: MemberExpr -// CXX23-NEXT: CXXThisExpr{{.+}}'cwg2771::A *' +// CHECK: CXXMethodDecl{{.+}}cwg2771 +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: DeclStmt +// CHECK-NEXT: VarDecl +// CHECK-NEXT: UnaryOperator +// CHECK-NEXT: MemberExpr +// CHECK-NEXT: CXXThisExpr{{.+}}'cwg2771::A *' } // namespace cwg2771 diff --git a/clang/test/CXX/drs/cwg2xx.cpp b/clang/test/CXX/drs/cwg2xx.cpp index ec37b420880e2..19b0685b73381 100644 --- a/clang/test/CXX/drs/cwg2xx.cpp +++ b/clang/test/CXX/drs/cwg2xx.cpp @@ -4,6 +4,7 @@ // RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,cxx98-17 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11,since-cxx14,since-cxx17 
-fexceptions -fcxx-exceptions -pedantic-errors // FIXME: diagnostic above is emitted only on Windows platforms // PR13819 -- __SIZE_TYPE__ is incompatible. diff --git a/clang/test/CXX/drs/cwg3xx.cpp b/clang/test/CXX/drs/cwg3xx.cpp index 26b0f975effa1..abe106716f6f9 100644 --- a/clang/test/CXX/drs/cwg3xx.cpp +++ b/clang/test/CXX/drs/cwg3xx.cpp @@ -1,9 +1,10 @@ -// RUN: %clang_cc1 -std=c++23 -verify=expected,cxx20-23,cxx23,since-cxx11,since-cxx17,since-cxx23 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++20 -verify=expected,cxx98-20,cxx20-23,since-cxx11,since-cxx17 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++17 -verify=expected,cxx98-17,cxx98-20,since-cxx11,since-cxx17 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++14 -verify=expected,cxx98-14,cxx98-17,cxx98-20,cxx11-14,since-cxx11 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++11 -verify=expected,cxx98-14,cxx98-17,cxx98-20,cxx11-14,since-cxx11 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++98 -verify=expected,cxx98-14,cxx98-17,cxx98-20,cxx98 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++11 -verify=expected,cxx98-14,cxx98-17,cxx98-20,cxx11-14,since-cxx11 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++14 -verify=expected,cxx98-14,cxx98-17,cxx98-20,cxx11-14,since-cxx11 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++17 -verify=expected,cxx98-17,cxx98-20,since-cxx11,since-cxx17 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++20 -verify=expected,cxx98-20,cxx20-23,since-cxx11,since-cxx17 -triple 
%itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++23 -verify=expected,cxx20-23,cxx23,since-cxx11,since-cxx17,since-cxx23 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++2c -verify=expected,cxx20-23,cxx23,since-cxx11,since-cxx17,since-cxx23 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors #if __cplusplus == 199711L #define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) diff --git a/clang/test/CXX/drs/cwgr593.cpp b/clang/test/CXX/drs/cwg593.cpp similarity index 100% rename from clang/test/CXX/drs/cwgr593.cpp rename to clang/test/CXX/drs/cwg593.cpp diff --git a/clang/test/CXX/drs/cwg6xx.cpp b/clang/test/CXX/drs/cwg6xx.cpp index 1c56dd3907152..7920c4383f475 100644 --- a/clang/test/CXX/drs/cwg6xx.cpp +++ b/clang/test/CXX/drs/cwg6xx.cpp @@ -4,6 +4,7 @@ // RUN: %clang_cc1 -std=c++17 %s -verify=expected,cxx11-20,cxx98-17,cxx11-17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -fno-spell-checking // RUN: %clang_cc1 -std=c++20 %s -verify=expected,cxx11-20,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -fno-spell-checking // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -fno-spell-checking +// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -fno-spell-checking #if __cplusplus == 199711L #define static_assert(...) 
__extension__ _Static_assert(__VA_ARGS__) diff --git a/clang/test/CXX/drs/cwg7xx.cpp b/clang/test/CXX/drs/cwg7xx.cpp index 842d172d37900..d606542ff14d9 100644 --- a/clang/test/CXX/drs/cwg7xx.cpp +++ b/clang/test/CXX/drs/cwg7xx.cpp @@ -2,7 +2,9 @@ // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 %s -verify=expected,cxx98-14,cxx98-11,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++14 %s -verify=expected,cxx98-14,since-cxx14,since-cxx11,cxx14 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++17 %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++2a %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++23 %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++2c %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors #if __cplusplus == 199711L #define static_assert(...) 
__extension__ _Static_assert(__VA_ARGS__) diff --git a/clang/test/CXX/drs/cwg9xx.cpp b/clang/test/CXX/drs/cwg9xx.cpp index d4f54bcdad6ea..9b1387ca0e768 100644 --- a/clang/test/CXX/drs/cwg9xx.cpp +++ b/clang/test/CXX/drs/cwg9xx.cpp @@ -4,6 +4,7 @@ // RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors namespace std { __extension__ typedef __SIZE_TYPE__ size_t; From 054e7c59713c67ad7b65a92e4b8887076d3881b9 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 5 Jan 2025 13:02:31 +0000 Subject: [PATCH 194/480] [VectorCombine] foldInsExtVectorToShuffle - ignore shuffle costs for 'identity' insertion masks 'inplace' single src shuffles can be treated as free identity shuffles - ignore any shuffle cost (similar to what we already do in other folds like foldShuffleOfShuffles) - eventually getShuffleCost should just return TCC_Free in these cases but in a lot of the targets' shuffle cost logic this currently ends up treated as a generic SK_PermuteSingleSrc. We still want to generate the shuffle as it will help further shuffle folds with the additional PoisonMaskElem 'undemanded' elements. 
--- .../Transforms/Vectorize/VectorCombine.cpp | 8 +++- .../test/Transforms/PhaseOrdering/X86/hadd.ll | 45 +++++++------------ .../VectorCombine/X86/load-inseltpoison.ll | 5 +-- 3 files changed, 22 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 9bca613593591..428f7d94ca8af 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3097,8 +3097,12 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) { TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx); InstructionCost OldCost = ExtCost + InsCost; - InstructionCost NewCost = TTI.getShuffleCost(SK, VecTy, Mask, CostKind, 0, - nullptr, {DstVec, SrcVec}); + // Ignore 'free' identity insertion shuffle. + // TODO: getShuffleCost should return TCC_Free for Identity shuffles. + InstructionCost NewCost = 0; + if (!ShuffleVectorInst::isIdentityMask(Mask, NumElts)) + NewCost += TTI.getShuffleCost(SK, VecTy, Mask, CostKind, 0, nullptr, + {DstVec, SrcVec}); if (!Ext->hasOneUse()) NewCost += ExtCost; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll index 67da29b6cee7d..57d4d4554a0cd 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll @@ -512,13 +512,10 @@ define <4 x float> @add_v4f32_u123(<4 x float> %a, <4 x float> %b) { define <4 x float> @add_v4f32_0u23(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @add_v4f32_0u23( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x 
float> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[RESULT1]] +; CHECK-NEXT: ret <4 x float> [[TMP4]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 @@ -542,13 +539,10 @@ define <4 x float> @add_v4f32_0u23(<4 x float> %a, <4 x float> %b) { define <4 x float> @add_v4f32_01u3(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: @add_v4f32_01u3( -; SSE2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> -; SSE2-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] -; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]] -; SSE2-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> -; SSE2-NEXT: ret <4 x float> [[RESULT1]] +; SSE2-NEXT: ret <4 x float> [[TMP4]] ; ; SSE4-LABEL: @add_v4f32_01u3( ; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> @@ -563,13 +557,10 @@ define <4 x float> @add_v4f32_01u3(<4 x float> %a, <4 x float> %b) { ; AVX2-NEXT: ret <4 x float> [[TMP4]] ; ; AVX512-LABEL: @add_v4f32_01u3( -; AVX512-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> -; AVX512-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> -; 
AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> ; AVX512-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]] -; AVX512-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> -; AVX512-NEXT: ret <4 x float> [[RESULT1]] +; AVX512-NEXT: ret <4 x float> [[TMP4]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 @@ -593,13 +584,10 @@ define <4 x float> @add_v4f32_01u3(<4 x float> %a, <4 x float> %b) { define <4 x float> @add_v4f32_012u(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: @add_v4f32_012u( -; SSE2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> -; SSE2-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] -; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]] -; SSE2-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> -; SSE2-NEXT: ret <4 x float> [[RESULT1]] +; SSE2-NEXT: ret <4 x float> [[TMP4]] ; ; SSE4-LABEL: @add_v4f32_012u( ; SSE4-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> @@ -620,13 +608,10 @@ define <4 x float> @add_v4f32_012u(<4 x float> %a, <4 x float> %b) { ; AVX2-NEXT: ret <4 x float> [[RESULT]] ; ; AVX512-LABEL: @add_v4f32_012u( -; AVX512-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> -; 
AVX512-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> ; AVX512-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]] -; AVX512-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> -; AVX512-NEXT: ret <4 x float> [[RESULT1]] +; AVX512-NEXT: ret <4 x float> [[TMP4]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll index 937d4043adc0c..2db1e21b3e95a 100644 --- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll @@ -544,10 +544,7 @@ define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr noc ; CHECK-NEXT: [[T1:%.*]] = insertelement <2 x float> poison, float [[SCALE]], i32 0 ; CHECK-NEXT: [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1 ; CHECK-NEXT: [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]] -; CHECK-NEXT: [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0 -; CHECK-NEXT: [[RESULT0:%.*]] = insertelement <2 x float> poison, float [[T4]], i32 0 -; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <2 x float> [[RESULT0]], <2 x float> [[T3]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8 +; CHECK-NEXT: store <2 x float> [[T3]], ptr [[RESULTPTR:%.*]], align 8 ; CHECK-NEXT: ret void ; %scaleptr = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr() From 1d155412547b0073ecbbdccb02acc31273f3be49 Mon Sep 17 
00:00:00 2001 From: Peng Liu Date: Sun, 5 Jan 2025 08:08:50 -0500 Subject: [PATCH 195/480] [libc++] Remove abandoned __append declaration in vector<bool> (#121673) The `vector<bool>` implementation in libcxx contains a declaration of a private `__append` function, which is neither defined nor used anywhere in the codebase. This PR aims to remove this abandoned declaration, as its presence is misleading and could lead to confusion during future maintenance. I have no idea why we have a declaration without a definition. My guess is that the declaration might be inherited from the implementation of `vector<T>`, where `__append` is both necessary and properly defined. The declaration may have been inadvertently copied from `vector<T>` to `vector<bool>` and subsequently abandoned, as `vector<bool>` never needs it. --- libcxx/include/__vector/vector_bool.h | 1 - 1 file changed, 1 deletion(-) diff --git a/libcxx/include/__vector/vector_bool.h b/libcxx/include/__vector/vector_bool.h index 525fc35b26cc9..8658745b8a8f9 100644 --- a/libcxx/include/__vector/vector_bool.h +++ b/libcxx/include/__vector/vector_bool.h @@ -442,7 +442,6 @@ class _LIBCPP_TEMPLATE_VIS vector { template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __construct_at_end(_InputIterator __first, _Sentinel __last, size_type __n); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __append(size_type __n, const_reference __x); _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference __make_ref(size_type __pos) _NOEXCEPT { return reference(__begin_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); } From 2adcec7780f3d3027f76c80dbd73085fdee144d7 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Sun, 5 Jan 2025 21:15:17 +0800 Subject: [PATCH 196/480] [InstCombine] Simplify with.overflow intrinsics with assumption information (#84016) This patch recognizes never-overflow assumptions generated by rustc to improve the codegen. 
Please refer to https://github.com/rust-lang/hashbrown/issues/509 for more details. Closes https://github.com/rust-lang/hashbrown/issues/509 Closes https://github.com/llvm/llvm-project/issues/80637 --- .../InstCombine/InstCombineCalls.cpp | 29 ++++++ llvm/test/Transforms/InstCombine/overflow.ll | 98 ++++++++++++++++++- 2 files changed, 122 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index fd38738e3be80..cdb2c11ef046a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -839,6 +839,35 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) { if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(), WO->getRHS(), *WO, OperationResult, OverflowResult)) return createOverflowTuple(WO, OperationResult, OverflowResult); + + // See whether we can optimize the overflow check with assumption information. 
+ for (User *U : WO->users()) { + if (!match(U, m_ExtractValue<1>(m_Value()))) + continue; + + for (auto &AssumeVH : AC.assumptionsFor(U)) { + if (!AssumeVH) + continue; + CallInst *I = cast(AssumeVH); + if (!match(I->getArgOperand(0), m_Not(m_Specific(U)))) + continue; + if (!isValidAssumeForContext(I, II, /*DT=*/nullptr, + /*AllowEphemerals=*/true)) + continue; + Value *Result = + Builder.CreateBinOp(WO->getBinaryOp(), WO->getLHS(), WO->getRHS()); + Result->takeName(WO); + if (auto *Inst = dyn_cast(Result)) { + if (WO->isSigned()) + Inst->setHasNoSignedWrap(); + else + Inst->setHasNoUnsignedWrap(); + } + return createOverflowTuple(WO, Result, + ConstantInt::getFalse(U->getType())); + } + } + return nullptr; } diff --git a/llvm/test/Transforms/InstCombine/overflow.ll b/llvm/test/Transforms/InstCombine/overflow.ll index a8969a5ed02c3..22e1631f78ee9 100644 --- a/llvm/test/Transforms/InstCombine/overflow.ll +++ b/llvm/test/Transforms/InstCombine/overflow.ll @@ -11,7 +11,7 @@ define i32 @test1(i32 %a, i32 %b) nounwind ssp { ; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { i32, i1 } [[SADD]], 1 ; CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; CHECK: if.then: -; CHECK-NEXT: tail call void @throwAnExceptionOrWhatever() #[[ATTR2:[0-9]+]] +; CHECK-NEXT: tail call void @throwAnExceptionOrWhatever() #[[ATTR3:[0-9]+]] ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: ; CHECK-NEXT: [[SADD_RESULT:%.*]] = extractvalue { i32, i1 } [[SADD]], 0 @@ -49,7 +49,7 @@ define i32 @test2(i32 %a, i32 %b, ptr %P) nounwind ssp { ; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[ADD_OFF]], 4294967295 ; CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; CHECK: if.then: -; CHECK-NEXT: tail call void @throwAnExceptionOrWhatever() #[[ATTR2]] +; CHECK-NEXT: tail call void @throwAnExceptionOrWhatever() #[[ATTR3]] ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: ; CHECK-NEXT: [[CONV9:%.*]] = trunc i64 [[ADD]] to i32 @@ -86,7 +86,7 @@ define i64 @test3(i32 %a, 
i32 %b) nounwind ssp { ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], -4294967296 ; CHECK-NEXT: br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; CHECK: if.then: -; CHECK-NEXT: tail call void @throwAnExceptionOrWhatever() #[[ATTR2]] +; CHECK-NEXT: tail call void @throwAnExceptionOrWhatever() #[[ATTR3]] ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: ; CHECK-NEXT: ret i64 [[ADD]] @@ -116,7 +116,7 @@ define zeroext i8 @test4(i8 signext %a, i8 signext %b) nounwind ssp { ; CHECK-NEXT: [[CMP:%.*]] = extractvalue { i8, i1 } [[SADD]], 1 ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; CHECK: if.then: -; CHECK-NEXT: tail call void @throwAnExceptionOrWhatever() #[[ATTR2]] +; CHECK-NEXT: tail call void @throwAnExceptionOrWhatever() #[[ATTR3]] ; CHECK-NEXT: unreachable ; CHECK: if.end: ; CHECK-NEXT: [[SADD_RESULT:%.*]] = extractvalue { i8, i1 } [[SADD]], 0 @@ -150,7 +150,7 @@ define i32 @test8(i64 %a, i64 %b) nounwind ssp { ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], -4294967296 ; CHECK-NEXT: br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; CHECK: if.then: -; CHECK-NEXT: tail call void @throwAnExceptionOrWhatever() #[[ATTR2]] +; CHECK-NEXT: tail call void @throwAnExceptionOrWhatever() #[[ATTR3]] ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: ; CHECK-NEXT: [[CONV9:%.*]] = trunc i64 [[ADD]] to i32 @@ -171,3 +171,91 @@ if.end: ret i32 %conv9 } +define i32 @uadd_no_overflow(i32 %a, i32 %b) { +; CHECK-LABEL: @uadd_no_overflow( +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret i32 [[TMP1]] +; + %val = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) + %ov = extractvalue { i32, i1 } %val, 1 + %nowrap = xor i1 %ov, true + tail call void @llvm.assume(i1 %nowrap) + %res = extractvalue { i32, i1 } %val, 0 + ret i32 %res +} + +define i32 @smul_no_overflow(i32 %a, i32 %b) { +; CHECK-LABEL: @smul_no_overflow( +; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i32 [[A:%.*]], [[B:%.*]] 
+; CHECK-NEXT: ret i32 [[TMP1]] +; + %val = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %a, i32 %b) + %ov = extractvalue { i32, i1 } %val, 1 + %nowrap = xor i1 %ov, true + tail call void @llvm.assume(i1 %nowrap) + %res = extractvalue { i32, i1 } %val, 0 + ret i32 %res +} + +define i32 @smul_overflow(i32 %a, i32 %b) { +; CHECK-LABEL: @smul_overflow( +; CHECK-NEXT: [[VAL:%.*]] = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A:%.*]], i32 [[B:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i32, i1 } [[VAL]], 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[OV]]) +; CHECK-NEXT: [[RES:%.*]] = extractvalue { i32, i1 } [[VAL]], 0 +; CHECK-NEXT: ret i32 [[RES]] +; + %val = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %a, i32 %b) + %ov = extractvalue { i32, i1 } %val, 1 + tail call void @llvm.assume(i1 %ov) + %res = extractvalue { i32, i1 } %val, 0 + ret i32 %res +} + +define i32 @uadd_no_overflow_invalid1(i32 %a, i32 %b, i1 %cond) { +; CHECK-LABEL: @uadd_no_overflow_invalid1( +; CHECK-NEXT: [[VAL:%.*]] = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A:%.*]], i32 [[B:%.*]]) +; CHECK-NEXT: [[RES:%.*]] = extractvalue { i32, i1 } [[VAL]], 0 +; CHECK-NEXT: call void @use(i32 [[RES]]) +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i32, i1 } [[VAL]], 1 +; CHECK-NEXT: [[NOWRAP:%.*]] = xor i1 [[OV]], true +; CHECK-NEXT: tail call void @llvm.assume(i1 [[NOWRAP]]) +; CHECK-NEXT: ret i32 [[RES]] +; CHECK: if.else: +; CHECK-NEXT: ret i32 0 +; + %val = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) + %res = extractvalue { i32, i1 } %val, 0 + call void @use(i32 %res) + br i1 %cond, label %if.then, label %if.else +if.then: + %ov = extractvalue { i32, i1 } %val, 1 + %nowrap = xor i1 %ov, true + tail call void @llvm.assume(i1 %nowrap) + ret i32 %res +if.else: + ret i32 0 +} + +define i32 @uadd_no_overflow_invalid2(i32 %a, i32 %b, i1 
%cond) { +; CHECK-LABEL: @uadd_no_overflow_invalid2( +; CHECK-NEXT: [[VAL:%.*]] = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A:%.*]], i32 [[B:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i32, i1 } [[VAL]], 1 +; CHECK-NEXT: [[NOWRAP:%.*]] = xor i1 [[OV]], true +; CHECK-NEXT: call void @use(i32 0) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[NOWRAP]]) +; CHECK-NEXT: [[RES:%.*]] = extractvalue { i32, i1 } [[VAL]], 0 +; CHECK-NEXT: ret i32 [[RES]] +; + %val = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) + %ov = extractvalue { i32, i1 } %val, 1 + %nowrap = xor i1 %ov, true + call void @use(i32 0) ; It is not guaranteed to transfer execution to its successors + tail call void @llvm.assume(i1 %nowrap) + %res = extractvalue { i32, i1 } %val, 0 + ret i32 %res +} + +declare void @use(i32) From f48884ded884d982a7fd13394b0e93e6588f4143 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 5 Jan 2025 15:50:42 +0000 Subject: [PATCH 197/480] [VPlan] Remove loop region in optimizeForVFAndUF. (#108378) Update optimizeForVFAndUF to completely remove the vector loop region when possible. At the moment, we cannot remove the region if it contains * widened IVs: the recipe is needed to generate the step vector * reductions: ComputeReductionResults requires the reduction phi recipe for codegen. Both cases can be addressed by more explicit modeling. The patch also includes a number of updates to allow executing VPlans without a vector loop region. 
Depends on https://github.com/llvm/llvm-project/pull/110004 --- .../Transforms/Vectorize/LoopVectorize.cpp | 87 +-- llvm/lib/Transforms/Vectorize/VPlan.cpp | 35 +- llvm/lib/Transforms/Vectorize/VPlan.h | 8 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 56 +- .../LoopVectorize/AArch64/call-costs.ll | 15 +- .../LoopVectorize/RISCV/low-trip-count.ll | 164 +++--- .../LoopVectorize/RISCV/short-trip-count.ll | 40 +- .../truncate-to-minimal-bitwidth-cost.ll | 27 +- .../LoopVectorize/SystemZ/pr47665.ll | 101 ++-- ...demanding-all-lanes-and-first-lane-only.ll | 48 +- .../LoopVectorize/X86/constant-fold.ll | 16 +- .../Transforms/LoopVectorize/X86/pr34438.ll | 24 +- .../debugloc-optimize-vfuf-term.ll | 41 +- .../LoopVectorize/first-order-recurrence.ll | 27 +- .../vector-loop-backedge-elimination.ll | 505 ++++++++---------- .../version-stride-with-integer-casts.ll | 14 +- .../vplan-printing-before-execute.ll | 35 +- 17 files changed, 567 insertions(+), 676 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 7b4a47dc02b6d..0d441d81ac1a2 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2394,12 +2394,12 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, // End if-block. VPRegionBlock *Parent = RepRecipe->getParent()->getParent(); bool IfPredicateInstr = Parent ? 
Parent->isReplicator() : false; - assert((Parent || all_of(RepRecipe->operands(), - [](VPValue *Op) { - return Op->isDefinedOutsideLoopRegions(); - })) && - "Expected a recipe is either within a region or all of its operands " - "are defined outside the vectorized region."); + assert( + (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || + all_of(RepRecipe->operands(), + [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && + "Expected a recipe is either within a region or all of its operands " + "are defined outside the vectorized region."); if (IfPredicateInstr) PredicatedInstructions.push_back(Cloned); } @@ -3012,6 +3012,11 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State); } + // Don't apply optimizations below when no vector region remains, as they all + // require a vector loop at the moment. + if (!State.Plan->getVectorLoopRegion()) + return; + for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); @@ -7744,6 +7749,8 @@ DenseMap LoopVectorizationPlanner::executePlan( // 1. Set up the skeleton for vectorization, including vector pre-header and // middle block. The vector loop is created during VPlan execution. + VPBasicBlock *VectorPH = + cast(BestVPlan.getEntry()->getSingleSuccessor()); State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton( ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs); if (VectorizingEpilogue) @@ -7781,7 +7788,7 @@ DenseMap LoopVectorizationPlanner::executePlan( BestVPlan.prepareToExecute( ILV.getTripCount(), ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State); - replaceVPBBWithIRVPBB(BestVPlan.getVectorPreheader(), State.CFG.PrevBB); + replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB); BestVPlan.execute(&State); @@ -7807,30 +7814,31 @@ DenseMap LoopVectorizationPlanner::executePlan( // 2.6. 
Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). - MDNode *OrigLoopID = OrigLoop->getLoopID(); + if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) { + MDNode *OrigLoopID = OrigLoop->getLoopID(); - std::optional VectorizedLoopID = - makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, - LLVMLoopVectorizeFollowupVectorized}); - - VPBasicBlock *HeaderVPBB = - BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); - Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); - if (VectorizedLoopID) - L->setLoopID(*VectorizedLoopID); - else { - // Keep all loop hints from the original loop on the vector loop (we'll - // replace the vectorizer-specific hints below). - if (MDNode *LID = OrigLoop->getLoopID()) - L->setLoopID(LID); - - LoopVectorizeHints Hints(L, true, *ORE); - Hints.setAlreadyVectorized(); + std::optional VectorizedLoopID = + makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, + LLVMLoopVectorizeFollowupVectorized}); + + VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); + Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); + if (VectorizedLoopID) { + L->setLoopID(*VectorizedLoopID); + } else { + // Keep all loop hints from the original loop on the vector loop (we'll + // replace the vectorizer-specific hints below). + if (MDNode *LID = OrigLoop->getLoopID()) + L->setLoopID(LID); + + LoopVectorizeHints Hints(L, true, *ORE); + Hints.setAlreadyVectorized(); + } + TargetTransformInfo::UnrollingPreferences UP; + TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); + if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) + addRuntimeUnrollDisableMetaData(L); } - TargetTransformInfo::UnrollingPreferences UP; - TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); - if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) - addRuntimeUnrollDisableMetaData(L); // 3. 
Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. @@ -7839,15 +7847,18 @@ DenseMap LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); // 4. Adjust branch weight of the branch in the middle block. - auto *MiddleTerm = - cast(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator()); - if (MiddleTerm->isConditional() && - hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { - // Assume that `Count % VectorTripCount` is equally distributed. - unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); - assert(TripCount > 0 && "trip count should not be zero"); - const uint32_t Weights[] = {1, TripCount - 1}; - setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + if (BestVPlan.getVectorLoopRegion()) { + auto *MiddleVPBB = BestVPlan.getMiddleBlock(); + auto *MiddleTerm = + cast(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator()); + if (MiddleTerm->isConditional() && + hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { + // Assume that `Count % VectorTripCount` is equally distributed. + unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); + assert(TripCount > 0 && "trip count should not be zero"); + const uint32_t Weights[] = {1, TripCount - 1}; + setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + } } return State.ExpandedSCEVs; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 06c36396a17f3..e804f81c36dba 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -555,7 +555,9 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { template static T *getEnclosingLoopRegionForRegion(T *P) { if (P && P->isReplicator()) { P = P->getParent(); - assert(!cast(P)->isReplicator() && + // Multiple loop regions can be nested, but replicate regions can only be + // nested inside a loop region or must be outside any other region. 
+ assert((!P || !cast(P)->isReplicator()) && "unexpected nested replicate regions"); } return P; @@ -934,7 +936,8 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); // FIXME: Model VF * UF computation completely in VPlan. - assert(VFxUF.getNumUsers() && "VFxUF expected to always have users"); + assert((!getVectorLoopRegion() || VFxUF.getNumUsers()) && + "VFxUF expected to always have users"); unsigned UF = getUF(); if (VF.getNumUsers()) { Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF); @@ -988,12 +991,18 @@ void VPlan::execute(VPTransformState *State) { for (VPBlockBase *Block : RPOT) Block->execute(State); - VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock(); + State->CFG.DTU.flush(); + + auto *LoopRegion = getVectorLoopRegion(); + if (!LoopRegion) + return; + + VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock(); BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; // Fix the latch value of canonical, reduction and first-order recurrences // phis in the vector loop. - VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); + VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); for (VPRecipeBase &R : Header->phis()) { // Skip phi-like recipes that generate their backedege values themselves. if (isa(&R)) @@ -1032,8 +1041,6 @@ void VPlan::execute(VPTransformState *State) { Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar); cast(Phi)->addIncoming(Val, VectorLatchBB); } - - State->CFG.DTU.flush(); } InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) { @@ -1046,14 +1053,14 @@ VPRegionBlock *VPlan::getVectorLoopRegion() { // TODO: Cache if possible. for (VPBlockBase *B : vp_depth_first_shallow(getEntry())) if (auto *R = dyn_cast(B)) - return R; + return R->isReplicator() ? 
nullptr : R; return nullptr; } const VPRegionBlock *VPlan::getVectorLoopRegion() const { for (const VPBlockBase *B : vp_depth_first_shallow(getEntry())) if (auto *R = dyn_cast(B)) - return R; + return R->isReplicator() ? nullptr : R; return nullptr; } @@ -1399,11 +1406,17 @@ void VPlanIngredient::print(raw_ostream &O) const { #endif -bool VPValue::isDefinedOutsideLoopRegions() const { - return !hasDefiningRecipe() || - !getDefiningRecipe()->getParent()->getEnclosingLoopRegion(); +/// Returns true if there is a vector loop region and \p VPV is defined in a +/// loop region. +static bool isDefinedInsideLoopRegions(const VPValue *VPV) { + const VPRecipeBase *DefR = VPV->getDefiningRecipe(); + return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() || + DefR->getParent()->getEnclosingLoopRegion()); } +bool VPValue::isDefinedOutsideLoopRegions() const { + return !isDefinedInsideLoopRegions(this); +} void VPValue::replaceAllUsesWith(VPValue *New) { replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; }); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 1be57d23f19cf..5d2914a7d8323 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3853,9 +3853,13 @@ class VPlan { VPBasicBlock *getEntry() { return Entry; } const VPBasicBlock *getEntry() const { return Entry; } - /// Returns the preheader of the vector loop region. + /// Returns the preheader of the vector loop region, if one exists, or null + /// otherwise. VPBasicBlock *getVectorPreheader() { - return cast(getVectorLoopRegion()->getSinglePredecessor()); + VPRegionBlock *VectorRegion = getVectorLoopRegion(); + return VectorRegion + ? cast(VectorRegion->getSinglePredecessor()) + : nullptr; } /// Returns the VPRegionBlock of the vector loop. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index c5a73021ca8cb..433a4c8184fdf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -794,12 +794,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1)); } -/// Try to simplify the recipes in \p Plan -static void simplifyRecipes(VPlan &Plan) { +/// Try to simplify the recipes in \p Plan. Use \p CanonicalIVTy as type for all +/// un-typed live-ins in VPTypeAnalysis. +static void simplifyRecipes(VPlan &Plan, Type *CanonicalIVTy) { ReversePostOrderTraversal> RPOT( Plan.getEntry()); - Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); - VPTypeAnalysis TypeInfo(CanonicalIVType); + VPTypeAnalysis TypeInfo(CanonicalIVTy); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { simplifyRecipe(R, TypeInfo); @@ -812,8 +812,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, PredicatedScalarEvolution &PSE) { assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan"); assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan"); - VPBasicBlock *ExitingVPBB = - Plan.getVectorLoopRegion()->getExitingBasicBlock(); + VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock(); auto *Term = &ExitingVPBB->back(); // Try to simplify the branch condition if TC <= VF * UF when preparing to // execute the plan for the main vector loop. 
We only do this if the @@ -837,14 +837,42 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C)) return; - LLVMContext &Ctx = SE.getContext(); - auto *BOC = new VPInstruction( - VPInstruction::BranchOnCond, - {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc()); + // The vector loop region only executes once. If possible, completely remove + // the region, otherwise replace the terminator controlling the latch with + // (BranchOnCond true). + auto *Header = cast(VectorRegion->getEntry()); + auto *CanIVTy = Plan.getCanonicalIV()->getScalarType(); + if (all_of( + Header->phis(), + IsaPred)) { + for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) { + auto *HeaderPhiR = cast(&HeaderR); + HeaderPhiR->replaceAllUsesWith(HeaderPhiR->getStartValue()); + HeaderPhiR->eraseFromParent(); + } - Term->eraseFromParent(); - ExitingVPBB->appendRecipe(BOC); + VPBlockBase *Preheader = VectorRegion->getSinglePredecessor(); + VPBlockBase *Exit = VectorRegion->getSingleSuccessor(); + VPBlockUtils::disconnectBlocks(Preheader, VectorRegion); + VPBlockUtils::disconnectBlocks(VectorRegion, Exit); + + for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry())) + B->setParent(nullptr); + VPBlockUtils::connectBlocks(Preheader, Header); + VPBlockUtils::connectBlocks(ExitingVPBB, Exit); + simplifyRecipes(Plan, CanIVTy); + } else { + // The vector region contains header phis for which we cannot remove the + // loop region yet. 
+ LLVMContext &Ctx = SE.getContext(); + auto *BOC = new VPInstruction( + VPInstruction::BranchOnCond, + {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc()); + ExitingVPBB->appendRecipe(BOC); + } + + Term->eraseFromParent(); VPlanTransforms::removeDeadRecipes(Plan); Plan.setVF(BestVF); @@ -1258,10 +1286,10 @@ void VPlanTransforms::optimize(VPlan &Plan) { removeRedundantCanonicalIVs(Plan); removeRedundantInductionCasts(Plan); - simplifyRecipes(Plan); + simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType()); legalizeAndOptimizeInductions(Plan); removeRedundantExpandSCEVRecipes(Plan); - simplifyRecipes(Plan); + simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType()); removeDeadRecipes(Plan); createAndOptimizeReplicateRegions(Plan); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll index 4f050877bd131..e63155b024c43 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll @@ -80,15 +80,13 @@ define void @powi_call(ptr %P) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[WIDE_LOAD]], i32 3) -; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], 
!llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TMP4]], align 8 +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -102,7 +100,7 @@ define void @powi_call(ptr %P) { ; CHECK-NEXT: store double [[POWI]], ptr [[GEP]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -233,6 +231,5 @@ declare i64 @llvm.fshl.i64(i64, i64, i64) ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index 5c5600b9cfdf8..10ac870c112ae 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -49,28 +49,26 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 3, [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 3, [[TMP2]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP7]], i64 3) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 3) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], splat (i8 1) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds 
i8, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv2i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv2i8.p0( [[TMP13]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -78,16 +76,16 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP15]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]] +; CHECK-NEXT: 
[[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]] ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 3 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -118,28 +116,26 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 5, [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 5, [[TMP2]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 5) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 5) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 ; CHECK-NEXT: 
[[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], splat (i8 1) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -147,16 +143,16 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr 
[[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP15]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]] ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -187,28 +183,26 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP2]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 8) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: 
[[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], splat (i8 1) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -216,16 +210,16 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1 -; CHECK-NEXT: 
[[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP15]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]] ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -256,19 +250,17 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], splat (i8 1) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]] -; 
CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shl <16 x i8> [[WIDE_LOAD]], splat (i8 1) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i8> [[TMP2]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP6]], align 1 +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -285,7 +277,7 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -317,19 +309,17 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; 
CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = shl <32 x i8> [[WIDE_LOAD]], splat (i8 1) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <32 x i8> [[TMP3]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[TMP5]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shl <32 x i8> [[WIDE_LOAD]], splat (i8 1) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <32 x i8> [[TMP2]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store <32 x i8> [[TMP5]], ptr [[TMP6]], align 1 +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -346,7 +336,7 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq 
i64 [[INC]], 32 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -390,7 +380,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -407,7 +397,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll index 375278eea38f9..3386a7d3972aa 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll @@ -14,16 +14,14 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, 
[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i32(i32 [[TMP3]], i32 4) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i32.p0(ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4:%.*]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP6:%.*]] = add nsw [[WIDE_MASKED_LOAD]], splat (i32 1) -; CHECK-NEXT: call void @llvm.masked.store.nxv1i32.p0( [[TMP6]], ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv1i32.p0( [[TMP6]], ptr [[TMP8]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -31,13 +29,13 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 
[[IV]] ; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[GEP]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[V]], 1 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 3 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -73,16 +71,14 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang ; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6:%.*]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP8:%.*]] = add nsw [[WIDE_MASKED_LOAD]], splat (i32 1) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP8]], ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; 
CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP8]], ptr [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -90,13 +86,13 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 [[IV]] ; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[GEP]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[V]], 1 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 3 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index bfdcfbf5139b3..f38aa11b5af87 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -163,20 +163,14 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = trunc <2 x i8> [[BROADCAST_SPLAT]] to <2 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i1> splat (i1 true), [[TMP0]] -; CHECK-NEXT: 
[[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT3]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT2]], -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[VEC_IV]], i32 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 [[TMP1]], i32 2) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 0, i32 2) ; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT1]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> zeroinitializer, <2 x ptr> [[BROADCAST_SPLAT4]], i32 1, <2 x i1> [[TMP3]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -194,7 +188,7 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { ; CHECK-NEXT: [[ADD]] = add i8 [[F_039]], 1 ; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[F_039]] to i32 ; 
CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[CONV]], 1 -; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -293,7 +287,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT6]], i32 8, [[TMP8]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -314,7 +308,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64 ; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[V]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -354,8 +348,7 @@ attributes #1 = { "target-features"="+64bit,+v" } ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} -; CHECK: [[LOOP9]] = distinct 
!{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll index 98245fcc0f0af..2de0f7e4d4016 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll @@ -19,125 +19,103 @@ define void @test(ptr %p, i40 %a) { ; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <16 x i1> [[TMP7]], zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE32:%.*]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i32> poison, i32 [[INDEX]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT2]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT3]], -; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <16 x i32> [[VEC_IV]], splat (i32 9) -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP10]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1 -; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] +; CHECK: pred.store.if1: +; 
CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: store i1 [[TMP9]], ptr [[P]], align 1 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] +; CHECK: pred.store.continue2: +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] ; CHECK: pred.store.if3: -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 2 ; CHECK-NEXT: store i1 [[TMP12]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] ; CHECK: pred.store.continue4: -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2 -; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] ; CHECK: pred.store.if5: -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 3 ; CHECK-NEXT: store i1 [[TMP14]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] ; CHECK: pred.store.continue6: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3 -; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] ; CHECK: pred.store.if7: -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 4 ; CHECK-NEXT: store i1 [[TMP16]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] ; CHECK: pred.store.continue8: -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4 -; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] ; CHECK: 
pred.store.if9: -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 5 ; CHECK-NEXT: store i1 [[TMP18]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] ; CHECK: pred.store.continue10: -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5 -; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] ; CHECK: pred.store.if11: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 6 ; CHECK-NEXT: store i1 [[TMP20]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] ; CHECK: pred.store.continue12: -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6 -; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]] ; CHECK: pred.store.if13: -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 7 ; CHECK-NEXT: store i1 [[TMP22]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] ; CHECK: pred.store.continue14: -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7 -; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]] ; CHECK: pred.store.if15: -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 8 ; CHECK-NEXT: store i1 [[TMP24]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]] ; CHECK: 
pred.store.continue16: -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8 -; CHECK-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] ; CHECK: pred.store.if17: -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 9 ; CHECK-NEXT: store i1 [[TMP26]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]] ; CHECK: pred.store.continue18: -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9 -; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] ; CHECK: pred.store.if19: -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 10 ; CHECK-NEXT: store i1 [[TMP28]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]] ; CHECK: pred.store.continue20: -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10 -; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] ; CHECK: pred.store.if21: -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 11 ; CHECK-NEXT: store i1 [[TMP30]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]] ; CHECK: pred.store.continue22: -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11 -; CHECK-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF23:%.*]], label 
[[PRED_STORE_CONTINUE24:%.*]] ; CHECK: pred.store.if23: -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 12 ; CHECK-NEXT: store i1 [[TMP32]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE24]] ; CHECK: pred.store.continue24: -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12 -; CHECK-NEXT: br i1 [[TMP33]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] ; CHECK: pred.store.if25: -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 13 ; CHECK-NEXT: store i1 [[TMP34]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE26]] ; CHECK: pred.store.continue26: -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13 -; CHECK-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] ; CHECK: pred.store.if27: -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 14 ; CHECK-NEXT: store i1 [[TMP36]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE28]] ; CHECK: pred.store.continue28: -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14 -; CHECK-NEXT: br i1 [[TMP37]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]] ; CHECK: pred.store.if29: -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 15 ; CHECK-NEXT: store i1 [[TMP38]], ptr [[P]], align 1 ; CHECK-NEXT: br label 
[[PRED_STORE_CONTINUE30]] ; CHECK: pred.store.continue30: -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15 -; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32]] -; CHECK: pred.store.if31: -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 -; CHECK-NEXT: store i1 [[TMP40]], ptr [[P]], align 1 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE32]] -; CHECK: pred.store.continue32: -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -156,7 +134,7 @@ define void @test(ptr %p, i40 %a) { ; CHECK-NEXT: store i1 [[ICMP_SGT]], ptr [[P]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[IV_NEXT]], 10 -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -183,7 +161,6 @@ exit: ; preds = %for.body } ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll index fcf1ba072a62c..61bcbaa1fe4d2 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll @@ -16,30 +16,24 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP4:%.*]] = mul nsw i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = mul nsw i64 [[TMP1]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = mul nsw i64 [[TMP2]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = mul nsw i64 [[TMP3]], 4 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP3:%.*]] = mul nsw i64 0, 4 +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw i64 1, 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw i64 2, 4 +; CHECK-NEXT: [[TMP15:%.*]] = mul nsw i64 3, 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr 
[[SRC_1]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP9]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP6]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP7]], align 1 ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP13]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP14]], i32 2 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8> [[TMP18]], i8 [[TMP15]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[TMP10]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8> [[TMP14]], i8 [[TMP11]], i32 3 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 4 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP23]], align 4 ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0 @@ -64,14 +58,13 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]] ; CHECK: [[PRED_STORE_CONTINUE4]]: ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3 -; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]] +; 
CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] ; CHECK: [[PRED_STORE_IF5]]: ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: store i32 [[TMP31]], ptr [[DST]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]] ; CHECK: [[PRED_STORE_CONTINUE6]]: -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -93,7 +86,7 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p ; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 4 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -125,7 +118,6 @@ exit: } ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll index 61cae9c1b3f5d..83e2f84814add 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll @@ -17,15 +17,11 @@ define void @f1() { ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 -; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, ptr [[TMP2]], i32 0 -; CHECK-NEXT: store <2 x ptr> , ptr [[TMP3]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = sext i16 0 to i64 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <2 x ptr> , ptr [[TMP2]], align 8 +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[BB3:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -40,7 +36,7 @@ define void @f1() { ; CHECK-NEXT: store ptr [[_TMP2]], ptr [[_TMP7]], align 8 ; CHECK-NEXT: [[_TMP9]] = add nsw i16 [[C_1_0]], 1 ; CHECK-NEXT: [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2 -; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: bb3: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll 
b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll index cc60359af2f8c..7816c4918761f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll @@ -16,18 +16,16 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly % ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP0]] +; 
CHECK-NEXT: [[TMP4:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-NEXT: store <8 x float> [[TMP4]], ptr [[TMP5]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -43,7 +41,7 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly % ; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll index 04ce9562c04b5..0f34f6243f155 100644 --- a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll +++ b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll @@ -11,22 +11,20 @@ define i32 @foo(ptr %p) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG3:![0-9]+]] -; CHECK-NEXT: store i8 0, ptr [[P]], align 1, !dbg [[DBG7:![0-9]+]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2, !dbg [[DBG3]] -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG3]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: store i8 0, ptr [[P]], align 1, !dbg [[DBG3:![0-9]+]] +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]], !dbg [[DBG7:![0-9]+]] ; 
CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG11:![0-9]+]] +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG8:![0-9]+]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG3]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG9:![0-9]+]] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG3]] -; CHECK-NEXT: [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG12:![0-9]+]] -; CHECK-NEXT: store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG7]] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG13:![0-9]+]] -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG14:![0-9]+]] -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG11]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG9]] +; CHECK-NEXT: [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG7]] +; CHECK-NEXT: store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG3]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG10:![0-9]+]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG11:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG8]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret i32 0 ; @@ -63,17 +61,16 @@ exit: ; preds = %loop ;. 
; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) ; CHECK: [[META1]] = !DIFile(filename: "test.cpp", directory: {{.*}}) -; CHECK: [[DBG3]] = !DILocation(line: 4, scope: [[META4:![0-9]+]]) +; CHECK: [[DBG3]] = !DILocation(line: 6, scope: [[META4:![0-9]+]]) ; CHECK: [[META4]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 11, type: [[META5:![0-9]+]], spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META6:![0-9]+]]) ; CHECK: [[META5]] = distinct !DISubroutineType(types: [[META6]]) ; CHECK: [[META6]] = !{} -; CHECK: [[DBG7]] = !DILocation(line: 6, scope: [[META4]]) -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META9:![0-9]+]], [[META10:![0-9]+]]} -; CHECK: [[META9]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META10]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[DBG11]] = !DILocation(line: 9, scope: [[META4]]) -; CHECK: [[DBG12]] = !DILocation(line: 5, scope: [[META4]]) -; CHECK: [[DBG13]] = !DILocation(line: 7, scope: [[META4]]) -; CHECK: [[DBG14]] = !DILocation(line: 8, scope: [[META4]]) -; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META10]], [[META9]]} +; CHECK: [[DBG7]] = !DILocation(line: 5, scope: [[META4]]) +; CHECK: [[DBG8]] = !DILocation(line: 9, scope: [[META4]]) +; CHECK: [[DBG9]] = !DILocation(line: 4, scope: [[META4]]) +; CHECK: [[DBG10]] = !DILocation(line: 7, scope: [[META4]]) +; CHECK: [[DBG11]] = !DILocation(line: 8, scope: [[META4]]) +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META14:![0-9]+]]} +; CHECK: [[META13]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META14]] = !{!"llvm.loop.isvectorized", i32 1} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index 7f562a4f2c445..509b8f99e4b61 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -3707,13 +3707,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) { ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: -; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; UNROLL-NO-IC-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; UNROLL-NO-IC-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: @@ -3726,7 +3721,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) { ; UNROLL-NO-IC-NEXT: [[ADD]] = add i64 [[PHI]], 1 ; UNROLL-NO-IC-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4 ; UNROLL-NO-IC-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1 -; UNROLL-NO-IC-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]] ; UNROLL-NO-IC: exit: ; UNROLL-NO-IC-NEXT: ret i32 0 ; @@ -3736,11 +3731,8 @@ define i32 @recurence_uniform_load(ptr %src, 
ptr noalias %dst) { ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: -; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[TMP0]] = load i32, ptr [[SRC:%.*]], align 4 -; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 +; UNROLL-NO-VF-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: @@ -3753,7 +3745,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) { ; UNROLL-NO-VF-NEXT: [[ADD]] = add i64 [[PHI]], 1 ; UNROLL-NO-VF-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4 ; UNROLL-NO-VF-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1 -; UNROLL-NO-VF-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]] ; UNROLL-NO-VF: exit: ; UNROLL-NO-VF-NEXT: ret i32 0 ; @@ -3763,13 +3755,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) { ; SINK-AFTER: vector.ph: ; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]] ; SINK-AFTER: vector.body: -; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ] ; SINK-AFTER-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 -; SINK-AFTER-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 -; SINK-AFTER-NEXT: [[BROADCAST_SPLAT]] = 
shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; SINK-AFTER-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; SINK-AFTER-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SINK-AFTER: scalar.ph: @@ -3782,7 +3769,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) { ; SINK-AFTER-NEXT: [[ADD]] = add i64 [[PHI]], 1 ; SINK-AFTER-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4 ; SINK-AFTER-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1 -; SINK-AFTER-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]] ; SINK-AFTER: exit: ; SINK-AFTER-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll index 7e6e5249381cd..fe5811e7e1159 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll @@ -63,19 +63,18 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]] ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] ; VF8UF2: [[VECTOR_BODY]]: -; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; VF8UF2-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; VF8UF2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] +; VF8UF2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 0 ; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 ; VF8UF2-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8 ; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = 
load <8 x i8>, ptr [[TMP2]], align 1 ; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1 ; VF8UF2-NEXT: [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10) ; VF8UF2-NEXT: [[TMP5:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10) -; VF8UF2-NEXT: store <8 x i8> [[TMP4]], ptr [[TMP2]], align 1 -; VF8UF2-NEXT: store <8 x i8> [[TMP5]], ptr [[TMP3]], align 1 -; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF8UF2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF8UF2-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; VF8UF2-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8 +; VF8UF2-NEXT: store <8 x i8> [[TMP4]], ptr [[TMP6]], align 1 +; VF8UF2-NEXT: store <8 x i8> [[TMP5]], ptr [[TMP7]], align 1 +; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]] ; VF8UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -92,7 +91,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; VF8UF2-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1 ; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 ; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; VF8UF2: [[EXIT]]: ; VF8UF2-NEXT: ret void ; @@ -108,15 +107,13 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; VF16UF1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]] ; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] ; VF16UF1: [[VECTOR_BODY]]: -; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; VF16UF1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; VF16UF1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]] +; VF16UF1-NEXT: [[NEXT_GEP:%.*]] = 
getelementptr i8, ptr [[A]], i64 0 ; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 ; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; VF16UF1-NEXT: [[TMP3:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10) -; VF16UF1-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP2]], align 1 -; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF16UF1-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF16UF1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; VF16UF1-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 1 +; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF16UF1: [[MIDDLE_BLOCK]]: ; VF16UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]] ; VF16UF1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -133,7 +130,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; VF16UF1-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1 ; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 ; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; VF16UF1: [[EXIT]]: ; VF16UF1-NEXT: ret void ; @@ -168,82 +165,68 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; VF8UF1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 ; VF8UF1-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]] -; VF8UF1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; VF8UF1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; VF8UF1-NEXT: br label %[[VECTOR_BODY:.*]] ; VF8UF1: [[VECTOR_BODY]]: -; VF8UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE16:.*]] ] -; VF8UF1-NEXT: [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]] -; VF8UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0 +; VF8UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; VF8UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer -; VF8UF1-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], -; VF8UF1-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; VF8UF1-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> , [[BROADCAST_SPLAT1]] ; VF8UF1-NEXT: [[TMP3:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0 ; VF8UF1-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; VF8UF1: [[PRED_STORE_IF]]: -; VF8UF1-NEXT: [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 0 -; VF8UF1-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP20]] +; VF8UF1-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 2 ; VF8UF1-NEXT: store i16 0, ptr [[TMP4]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE]] ; VF8UF1: [[PRED_STORE_CONTINUE]]: ; VF8UF1-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1 -; VF8UF1-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] -; VF8UF1: [[PRED_STORE_IF3]]: -; VF8UF1-NEXT: [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 1 -; VF8UF1-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP21]] +; VF8UF1-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] +; VF8UF1: [[PRED_STORE_IF1]]: +; VF8UF1-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 3 ; VF8UF1-NEXT: store i16 0, ptr [[TMP6]], align 2 +; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE2]] +; VF8UF1: [[PRED_STORE_CONTINUE2]]: +; VF8UF1-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2 +; VF8UF1-NEXT: br i1 [[TMP7]], label 
%[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] +; VF8UF1: [[PRED_STORE_IF3]]: +; VF8UF1-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 4 +; VF8UF1-NEXT: store i16 0, ptr [[TMP8]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE4]] ; VF8UF1: [[PRED_STORE_CONTINUE4]]: -; VF8UF1-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2 -; VF8UF1-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] +; VF8UF1-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3 +; VF8UF1-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] ; VF8UF1: [[PRED_STORE_IF5]]: -; VF8UF1-NEXT: [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 2 -; VF8UF1-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP23]] -; VF8UF1-NEXT: store i16 0, ptr [[TMP8]], align 2 +; VF8UF1-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 5 +; VF8UF1-NEXT: store i16 0, ptr [[TMP10]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE6]] ; VF8UF1: [[PRED_STORE_CONTINUE6]]: -; VF8UF1-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3 -; VF8UF1-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF8UF1-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4 +; VF8UF1-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] ; VF8UF1: [[PRED_STORE_IF7]]: -; VF8UF1-NEXT: [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 3 -; VF8UF1-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP24]] -; VF8UF1-NEXT: store i16 0, ptr [[TMP10]], align 2 +; VF8UF1-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 6 +; VF8UF1-NEXT: store i16 0, ptr [[TMP12]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE8]] ; VF8UF1: [[PRED_STORE_CONTINUE8]]: -; VF8UF1-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4 -; VF8UF1-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; 
VF8UF1-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5 +; VF8UF1-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] ; VF8UF1: [[PRED_STORE_IF9]]: -; VF8UF1-NEXT: [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 4 -; VF8UF1-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP26]] -; VF8UF1-NEXT: store i16 0, ptr [[TMP12]], align 2 +; VF8UF1-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 7 +; VF8UF1-NEXT: store i16 0, ptr [[TMP14]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE10]] ; VF8UF1: [[PRED_STORE_CONTINUE10]]: -; VF8UF1-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5 -; VF8UF1-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; VF8UF1-NEXT: [[TMP15:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6 +; VF8UF1-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] ; VF8UF1: [[PRED_STORE_IF11]]: -; VF8UF1-NEXT: [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 5 -; VF8UF1-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP19]] -; VF8UF1-NEXT: store i16 0, ptr [[TMP14]], align 2 +; VF8UF1-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 8 +; VF8UF1-NEXT: store i16 0, ptr [[TMP16]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE12]] ; VF8UF1: [[PRED_STORE_CONTINUE12]]: -; VF8UF1-NEXT: [[TMP15:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6 -; VF8UF1-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] +; VF8UF1-NEXT: [[TMP17:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7 +; VF8UF1-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] ; VF8UF1: [[PRED_STORE_IF13]]: -; VF8UF1-NEXT: [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 6 -; VF8UF1-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP22]] -; VF8UF1-NEXT: store i16 0, ptr [[TMP16]], align 2 +; VF8UF1-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 
9 +; VF8UF1-NEXT: store i16 0, ptr [[TMP18]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE14]] ; VF8UF1: [[PRED_STORE_CONTINUE14]]: -; VF8UF1-NEXT: [[TMP17:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7 -; VF8UF1-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16]] -; VF8UF1: [[PRED_STORE_IF15]]: -; VF8UF1-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 7 -; VF8UF1-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP25]] -; VF8UF1-NEXT: store i16 0, ptr [[TMP18]], align 2 -; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE16]] -; VF8UF1: [[PRED_STORE_CONTINUE16]]: -; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; VF8UF1-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF8UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF1: [[MIDDLE_BLOCK]]: ; VF8UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; VF8UF1: [[SCALAR_PH]]: @@ -255,7 +238,7 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF1-NEXT: store i16 0, ptr [[GEP_DST]], align 2 ; VF8UF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; VF8UF1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF8UF1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; VF8UF1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; VF8UF1: [[EXIT]]: ; VF8UF1-NEXT: ret void ; @@ -270,148 +253,125 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; VF8UF2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 ; VF8UF2-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]] -; VF8UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; VF8UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; VF8UF2-NEXT: 
br label %[[VECTOR_BODY:.*]] ; VF8UF2: [[VECTOR_BODY]]: -; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE35:.*]] ] -; VF8UF2-NEXT: [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]] -; VF8UF2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0 +; VF8UF2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; VF8UF2-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer -; VF8UF2-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], -; VF8UF2-NEXT: [[VEC_IV3:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], -; VF8UF2-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] -; VF8UF2-NEXT: [[TMP3:%.*]] = icmp ule <8 x i64> [[VEC_IV3]], [[BROADCAST_SPLAT]] +; VF8UF2-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> , [[BROADCAST_SPLAT1]] +; VF8UF2-NEXT: [[TMP3:%.*]] = icmp ule <8 x i64> , [[BROADCAST_SPLAT1]] ; VF8UF2-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0 ; VF8UF2-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; VF8UF2: [[PRED_STORE_IF]]: -; VF8UF2-NEXT: [[TMP36:%.*]] = add i64 [[OFFSET_IDX]], 0 -; VF8UF2-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP36]] +; VF8UF2-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[DST]], i64 2 ; VF8UF2-NEXT: store i16 0, ptr [[TMP5]], align 2 ; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE]] ; VF8UF2: [[PRED_STORE_CONTINUE]]: ; VF8UF2-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1 -; VF8UF2-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]] -; VF8UF2: [[PRED_STORE_IF6]]: -; VF8UF2-NEXT: [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 1 -; VF8UF2-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]] +; VF8UF2-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] +; 
VF8UF2: [[PRED_STORE_IF1]]: +; VF8UF2-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[DST]], i64 3 ; VF8UF2-NEXT: store i16 0, ptr [[TMP7]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE7]] -; VF8UF2: [[PRED_STORE_CONTINUE7]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE2]] +; VF8UF2: [[PRED_STORE_CONTINUE2]]: ; VF8UF2-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2 -; VF8UF2-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]] -; VF8UF2: [[PRED_STORE_IF8]]: -; VF8UF2-NEXT: [[TMP39:%.*]] = add i64 [[OFFSET_IDX]], 2 -; VF8UF2-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]] +; VF8UF2-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] +; VF8UF2: [[PRED_STORE_IF3]]: +; VF8UF2-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 4 ; VF8UF2-NEXT: store i16 0, ptr [[TMP9]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE9]] -; VF8UF2: [[PRED_STORE_CONTINUE9]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE4]] +; VF8UF2: [[PRED_STORE_CONTINUE4]]: ; VF8UF2-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3 -; VF8UF2-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]] -; VF8UF2: [[PRED_STORE_IF10]]: -; VF8UF2-NEXT: [[TMP40:%.*]] = add i64 [[OFFSET_IDX]], 3 -; VF8UF2-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP40]] +; VF8UF2-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] +; VF8UF2: [[PRED_STORE_IF5]]: +; VF8UF2-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 5 ; VF8UF2-NEXT: store i16 0, ptr [[TMP11]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE11]] -; VF8UF2: [[PRED_STORE_CONTINUE11]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE6]] +; VF8UF2: [[PRED_STORE_CONTINUE6]]: ; VF8UF2-NEXT: [[TMP12:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4 -; VF8UF2-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF12:.*]], label 
%[[PRED_STORE_CONTINUE13:.*]] -; VF8UF2: [[PRED_STORE_IF12]]: -; VF8UF2-NEXT: [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], 4 -; VF8UF2-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP42]] +; VF8UF2-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF8UF2: [[PRED_STORE_IF7]]: +; VF8UF2-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr [[DST]], i64 6 ; VF8UF2-NEXT: store i16 0, ptr [[TMP13]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE13]] -; VF8UF2: [[PRED_STORE_CONTINUE13]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; VF8UF2: [[PRED_STORE_CONTINUE8]]: ; VF8UF2-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5 -; VF8UF2-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]] -; VF8UF2: [[PRED_STORE_IF14]]: -; VF8UF2-NEXT: [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 5 -; VF8UF2-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]] +; VF8UF2-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF8UF2: [[PRED_STORE_IF9]]: +; VF8UF2-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[DST]], i64 7 ; VF8UF2-NEXT: store i16 0, ptr [[TMP15]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE15]] -; VF8UF2: [[PRED_STORE_CONTINUE15]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; VF8UF2: [[PRED_STORE_CONTINUE10]]: ; VF8UF2-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6 -; VF8UF2-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17:.*]] -; VF8UF2: [[PRED_STORE_IF16]]: -; VF8UF2-NEXT: [[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 6 -; VF8UF2-NEXT: [[TMP17:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]] +; VF8UF2-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; VF8UF2: [[PRED_STORE_IF11]]: +; VF8UF2-NEXT: [[TMP17:%.*]] = getelementptr i16, ptr [[DST]], i64 8 ; VF8UF2-NEXT: store i16 0, ptr [[TMP17]], align 2 -; 
VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE17]] -; VF8UF2: [[PRED_STORE_CONTINUE17]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; VF8UF2: [[PRED_STORE_CONTINUE12]]: ; VF8UF2-NEXT: [[TMP18:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7 -; VF8UF2-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]] -; VF8UF2: [[PRED_STORE_IF18]]: -; VF8UF2-NEXT: [[TMP46:%.*]] = add i64 [[OFFSET_IDX]], 7 -; VF8UF2-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP46]] +; VF8UF2-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] +; VF8UF2: [[PRED_STORE_IF13]]: +; VF8UF2-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr [[DST]], i64 9 ; VF8UF2-NEXT: store i16 0, ptr [[TMP19]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE19]] -; VF8UF2: [[PRED_STORE_CONTINUE19]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE14]] +; VF8UF2: [[PRED_STORE_CONTINUE14]]: ; VF8UF2-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 -; VF8UF2-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21:.*]] -; VF8UF2: [[PRED_STORE_IF20]]: -; VF8UF2-NEXT: [[TMP48:%.*]] = add i64 [[OFFSET_IDX]], 8 -; VF8UF2-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP48]] +; VF8UF2-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] +; VF8UF2: [[PRED_STORE_IF15]]: +; VF8UF2-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[DST]], i64 10 ; VF8UF2-NEXT: store i16 0, ptr [[TMP21]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE21]] -; VF8UF2: [[PRED_STORE_CONTINUE21]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE16]] +; VF8UF2: [[PRED_STORE_CONTINUE16]]: ; VF8UF2-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1 -; VF8UF2-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF22:.*]], label %[[PRED_STORE_CONTINUE23:.*]] -; VF8UF2: [[PRED_STORE_IF22]]: -; VF8UF2-NEXT: [[TMP49:%.*]] = add i64 [[OFFSET_IDX]], 9 -; VF8UF2-NEXT: 
[[TMP23:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP49]] +; VF8UF2-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] +; VF8UF2: [[PRED_STORE_IF17]]: +; VF8UF2-NEXT: [[TMP23:%.*]] = getelementptr i16, ptr [[DST]], i64 11 ; VF8UF2-NEXT: store i16 0, ptr [[TMP23]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE23]] -; VF8UF2: [[PRED_STORE_CONTINUE23]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE18]] +; VF8UF2: [[PRED_STORE_CONTINUE18]]: ; VF8UF2-NEXT: [[TMP24:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2 -; VF8UF2-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF24:.*]], label %[[PRED_STORE_CONTINUE25:.*]] -; VF8UF2: [[PRED_STORE_IF24]]: -; VF8UF2-NEXT: [[TMP51:%.*]] = add i64 [[OFFSET_IDX]], 10 -; VF8UF2-NEXT: [[TMP25:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP51]] +; VF8UF2-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] +; VF8UF2: [[PRED_STORE_IF19]]: +; VF8UF2-NEXT: [[TMP25:%.*]] = getelementptr i16, ptr [[DST]], i64 12 ; VF8UF2-NEXT: store i16 0, ptr [[TMP25]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE25]] -; VF8UF2: [[PRED_STORE_CONTINUE25]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE20]] +; VF8UF2: [[PRED_STORE_CONTINUE20]]: ; VF8UF2-NEXT: [[TMP26:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3 -; VF8UF2-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF26:.*]], label %[[PRED_STORE_CONTINUE27:.*]] -; VF8UF2: [[PRED_STORE_IF26]]: -; VF8UF2-NEXT: [[TMP38:%.*]] = add i64 [[OFFSET_IDX]], 11 -; VF8UF2-NEXT: [[TMP27:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP38]] +; VF8UF2-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; VF8UF2: [[PRED_STORE_IF21]]: +; VF8UF2-NEXT: [[TMP27:%.*]] = getelementptr i16, ptr [[DST]], i64 13 ; VF8UF2-NEXT: store i16 0, ptr [[TMP27]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE27]] -; VF8UF2: [[PRED_STORE_CONTINUE27]]: +; VF8UF2-NEXT: br label 
%[[PRED_STORE_CONTINUE22]] +; VF8UF2: [[PRED_STORE_CONTINUE22]]: ; VF8UF2-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4 -; VF8UF2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]] -; VF8UF2: [[PRED_STORE_IF28]]: -; VF8UF2-NEXT: [[TMP41:%.*]] = add i64 [[OFFSET_IDX]], 12 -; VF8UF2-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]] +; VF8UF2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; VF8UF2: [[PRED_STORE_IF23]]: +; VF8UF2-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[DST]], i64 14 ; VF8UF2-NEXT: store i16 0, ptr [[TMP29]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE29]] -; VF8UF2: [[PRED_STORE_CONTINUE29]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE24]] +; VF8UF2: [[PRED_STORE_CONTINUE24]]: ; VF8UF2-NEXT: [[TMP30:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5 -; VF8UF2-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]] -; VF8UF2: [[PRED_STORE_IF30]]: -; VF8UF2-NEXT: [[TMP44:%.*]] = add i64 [[OFFSET_IDX]], 13 -; VF8UF2-NEXT: [[TMP31:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP44]] +; VF8UF2-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] +; VF8UF2: [[PRED_STORE_IF25]]: +; VF8UF2-NEXT: [[TMP31:%.*]] = getelementptr i16, ptr [[DST]], i64 15 ; VF8UF2-NEXT: store i16 0, ptr [[TMP31]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE31]] -; VF8UF2: [[PRED_STORE_CONTINUE31]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE26]] +; VF8UF2: [[PRED_STORE_CONTINUE26]]: ; VF8UF2-NEXT: [[TMP32:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6 -; VF8UF2-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]] -; VF8UF2: [[PRED_STORE_IF32]]: -; VF8UF2-NEXT: [[TMP47:%.*]] = add i64 [[OFFSET_IDX]], 14 -; VF8UF2-NEXT: [[TMP33:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP47]] +; VF8UF2-NEXT: br i1 [[TMP32]], label 
%[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] +; VF8UF2: [[PRED_STORE_IF27]]: +; VF8UF2-NEXT: [[TMP33:%.*]] = getelementptr i16, ptr [[DST]], i64 16 ; VF8UF2-NEXT: store i16 0, ptr [[TMP33]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE33]] -; VF8UF2: [[PRED_STORE_CONTINUE33]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE28]] +; VF8UF2: [[PRED_STORE_CONTINUE28]]: ; VF8UF2-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7 -; VF8UF2-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35]] -; VF8UF2: [[PRED_STORE_IF34]]: -; VF8UF2-NEXT: [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 15 -; VF8UF2-NEXT: [[TMP35:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP50]] +; VF8UF2-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] +; VF8UF2: [[PRED_STORE_IF29]]: +; VF8UF2-NEXT: [[TMP35:%.*]] = getelementptr i16, ptr [[DST]], i64 17 ; VF8UF2-NEXT: store i16 0, ptr [[TMP35]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE35]] -; VF8UF2: [[PRED_STORE_CONTINUE35]]: -; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF8UF2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE30]] +; VF8UF2: [[PRED_STORE_CONTINUE30]]: +; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; VF8UF2: [[SCALAR_PH]]: @@ -423,7 +383,7 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF2-NEXT: store i16 0, ptr [[GEP_DST]], align 2 ; VF8UF2-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; VF8UF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF8UF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; VF8UF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; VF8UF2: [[EXIT]]: ; VF8UF2-NEXT: ret void ; @@ 
-438,146 +398,124 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF16UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; VF16UF1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 ; VF16UF1-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]] -; VF16UF1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; VF16UF1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer ; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] ; VF16UF1: [[VECTOR_BODY]]: -; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE32:.*]] ] -; VF16UF1-NEXT: [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]] -; VF16UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0 +; VF16UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; VF16UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer -; VF16UF1-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT1]], -; VF16UF1-NEXT: [[TMP2:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; VF16UF1-NEXT: [[TMP2:%.*]] = icmp ule <16 x i64> , [[BROADCAST_SPLAT1]] ; VF16UF1-NEXT: [[TMP3:%.*]] = extractelement <16 x i1> [[TMP2]], i32 0 ; VF16UF1-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; VF16UF1: [[PRED_STORE_IF]]: -; VF16UF1-NEXT: [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 0 -; VF16UF1-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP35]] +; VF16UF1-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 2 ; VF16UF1-NEXT: store i16 0, ptr [[TMP4]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE]] ; VF16UF1: [[PRED_STORE_CONTINUE]]: ; VF16UF1-NEXT: [[TMP5:%.*]] = extractelement <16 x i1> [[TMP2]], i32 1 
-; VF16UF1-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] -; VF16UF1: [[PRED_STORE_IF3]]: -; VF16UF1-NEXT: [[TMP36:%.*]] = add i64 [[OFFSET_IDX]], 1 -; VF16UF1-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP36]] +; VF16UF1-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] +; VF16UF1: [[PRED_STORE_IF1]]: +; VF16UF1-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 3 ; VF16UF1-NEXT: store i16 0, ptr [[TMP6]], align 2 +; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE2]] +; VF16UF1: [[PRED_STORE_CONTINUE2]]: +; VF16UF1-NEXT: [[TMP7:%.*]] = extractelement <16 x i1> [[TMP2]], i32 2 +; VF16UF1-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] +; VF16UF1: [[PRED_STORE_IF3]]: +; VF16UF1-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 4 +; VF16UF1-NEXT: store i16 0, ptr [[TMP8]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE4]] ; VF16UF1: [[PRED_STORE_CONTINUE4]]: -; VF16UF1-NEXT: [[TMP7:%.*]] = extractelement <16 x i1> [[TMP2]], i32 2 -; VF16UF1-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] +; VF16UF1-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP2]], i32 3 +; VF16UF1-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] ; VF16UF1: [[PRED_STORE_IF5]]: -; VF16UF1-NEXT: [[TMP38:%.*]] = add i64 [[OFFSET_IDX]], 2 -; VF16UF1-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP38]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP8]], align 2 +; VF16UF1-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 5 +; VF16UF1-NEXT: store i16 0, ptr [[TMP10]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE6]] ; VF16UF1: [[PRED_STORE_CONTINUE6]]: -; VF16UF1-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP2]], i32 3 -; VF16UF1-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF16UF1-NEXT: [[TMP11:%.*]] = 
extractelement <16 x i1> [[TMP2]], i32 4 +; VF16UF1-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] ; VF16UF1: [[PRED_STORE_IF7]]: -; VF16UF1-NEXT: [[TMP39:%.*]] = add i64 [[OFFSET_IDX]], 3 -; VF16UF1-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP10]], align 2 +; VF16UF1-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 6 +; VF16UF1-NEXT: store i16 0, ptr [[TMP12]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE8]] ; VF16UF1: [[PRED_STORE_CONTINUE8]]: -; VF16UF1-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP2]], i32 4 -; VF16UF1-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF16UF1-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[TMP2]], i32 5 +; VF16UF1-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] ; VF16UF1: [[PRED_STORE_IF9]]: -; VF16UF1-NEXT: [[TMP41:%.*]] = add i64 [[OFFSET_IDX]], 4 -; VF16UF1-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP12]], align 2 +; VF16UF1-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 7 +; VF16UF1-NEXT: store i16 0, ptr [[TMP14]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE10]] ; VF16UF1: [[PRED_STORE_CONTINUE10]]: -; VF16UF1-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[TMP2]], i32 5 -; VF16UF1-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; VF16UF1-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP2]], i32 6 +; VF16UF1-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] ; VF16UF1: [[PRED_STORE_IF11]]: -; VF16UF1-NEXT: [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], 5 -; VF16UF1-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP42]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP14]], align 2 +; VF16UF1-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 
8 +; VF16UF1-NEXT: store i16 0, ptr [[TMP16]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE12]] ; VF16UF1: [[PRED_STORE_CONTINUE12]]: -; VF16UF1-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP2]], i32 6 -; VF16UF1-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] +; VF16UF1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP2]], i32 7 +; VF16UF1-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] ; VF16UF1: [[PRED_STORE_IF13]]: -; VF16UF1-NEXT: [[TMP44:%.*]] = add i64 [[OFFSET_IDX]], 6 -; VF16UF1-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP44]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP16]], align 2 +; VF16UF1-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 9 +; VF16UF1-NEXT: store i16 0, ptr [[TMP18]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE14]] ; VF16UF1: [[PRED_STORE_CONTINUE14]]: -; VF16UF1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP2]], i32 7 -; VF16UF1-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] +; VF16UF1-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP2]], i32 8 +; VF16UF1-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] ; VF16UF1: [[PRED_STORE_IF15]]: -; VF16UF1-NEXT: [[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 7 -; VF16UF1-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP18]], align 2 +; VF16UF1-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[DST]], i64 10 +; VF16UF1-NEXT: store i16 0, ptr [[TMP20]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE16]] ; VF16UF1: [[PRED_STORE_CONTINUE16]]: -; VF16UF1-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP2]], i32 8 -; VF16UF1-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] +; VF16UF1-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[TMP2]], i32 9 +; VF16UF1-NEXT: br i1 
[[TMP21]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] ; VF16UF1: [[PRED_STORE_IF17]]: -; VF16UF1-NEXT: [[TMP47:%.*]] = add i64 [[OFFSET_IDX]], 8 -; VF16UF1-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP47]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP20]], align 2 +; VF16UF1-NEXT: [[TMP22:%.*]] = getelementptr i16, ptr [[DST]], i64 11 +; VF16UF1-NEXT: store i16 0, ptr [[TMP22]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE18]] ; VF16UF1: [[PRED_STORE_CONTINUE18]]: -; VF16UF1-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[TMP2]], i32 9 -; VF16UF1-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] +; VF16UF1-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[TMP2]], i32 10 +; VF16UF1-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] ; VF16UF1: [[PRED_STORE_IF19]]: -; VF16UF1-NEXT: [[TMP48:%.*]] = add i64 [[OFFSET_IDX]], 9 -; VF16UF1-NEXT: [[TMP22:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP48]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP22]], align 2 +; VF16UF1-NEXT: [[TMP24:%.*]] = getelementptr i16, ptr [[DST]], i64 12 +; VF16UF1-NEXT: store i16 0, ptr [[TMP24]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE20]] ; VF16UF1: [[PRED_STORE_CONTINUE20]]: -; VF16UF1-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[TMP2]], i32 10 -; VF16UF1-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; VF16UF1-NEXT: [[TMP25:%.*]] = extractelement <16 x i1> [[TMP2]], i32 11 +; VF16UF1-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] ; VF16UF1: [[PRED_STORE_IF21]]: -; VF16UF1-NEXT: [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 10 -; VF16UF1-NEXT: [[TMP24:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP50]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP24]], align 2 +; VF16UF1-NEXT: [[TMP26:%.*]] = getelementptr i16, ptr [[DST]], i64 13 +; VF16UF1-NEXT: store i16 0, ptr [[TMP26]], 
align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE22]] ; VF16UF1: [[PRED_STORE_CONTINUE22]]: -; VF16UF1-NEXT: [[TMP25:%.*]] = extractelement <16 x i1> [[TMP2]], i32 11 -; VF16UF1-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; VF16UF1-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP2]], i32 12 +; VF16UF1-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] ; VF16UF1: [[PRED_STORE_IF23]]: -; VF16UF1-NEXT: [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 11 -; VF16UF1-NEXT: [[TMP26:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP26]], align 2 +; VF16UF1-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[DST]], i64 14 +; VF16UF1-NEXT: store i16 0, ptr [[TMP28]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE24]] ; VF16UF1: [[PRED_STORE_CONTINUE24]]: -; VF16UF1-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP2]], i32 12 -; VF16UF1-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] +; VF16UF1-NEXT: [[TMP29:%.*]] = extractelement <16 x i1> [[TMP2]], i32 13 +; VF16UF1-NEXT: br i1 [[TMP29]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] ; VF16UF1: [[PRED_STORE_IF25]]: -; VF16UF1-NEXT: [[TMP40:%.*]] = add i64 [[OFFSET_IDX]], 12 -; VF16UF1-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP40]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP28]], align 2 +; VF16UF1-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[DST]], i64 15 +; VF16UF1-NEXT: store i16 0, ptr [[TMP30]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE26]] ; VF16UF1: [[PRED_STORE_CONTINUE26]]: -; VF16UF1-NEXT: [[TMP29:%.*]] = extractelement <16 x i1> [[TMP2]], i32 13 -; VF16UF1-NEXT: br i1 [[TMP29]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] +; VF16UF1-NEXT: [[TMP31:%.*]] = extractelement <16 x i1> [[TMP2]], i32 14 +; VF16UF1-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF27:.*]], 
label %[[PRED_STORE_CONTINUE28:.*]] ; VF16UF1: [[PRED_STORE_IF27]]: -; VF16UF1-NEXT: [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 13 -; VF16UF1-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP30]], align 2 +; VF16UF1-NEXT: [[TMP32:%.*]] = getelementptr i16, ptr [[DST]], i64 16 +; VF16UF1-NEXT: store i16 0, ptr [[TMP32]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE28]] ; VF16UF1: [[PRED_STORE_CONTINUE28]]: -; VF16UF1-NEXT: [[TMP31:%.*]] = extractelement <16 x i1> [[TMP2]], i32 14 -; VF16UF1-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] +; VF16UF1-NEXT: [[TMP33:%.*]] = extractelement <16 x i1> [[TMP2]], i32 15 +; VF16UF1-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] ; VF16UF1: [[PRED_STORE_IF29]]: -; VF16UF1-NEXT: [[TMP46:%.*]] = add i64 [[OFFSET_IDX]], 14 -; VF16UF1-NEXT: [[TMP32:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP46]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP32]], align 2 +; VF16UF1-NEXT: [[TMP34:%.*]] = getelementptr i16, ptr [[DST]], i64 17 +; VF16UF1-NEXT: store i16 0, ptr [[TMP34]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE30]] ; VF16UF1: [[PRED_STORE_CONTINUE30]]: -; VF16UF1-NEXT: [[TMP33:%.*]] = extractelement <16 x i1> [[TMP2]], i32 15 -; VF16UF1-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32]] -; VF16UF1: [[PRED_STORE_IF31]]: -; VF16UF1-NEXT: [[TMP49:%.*]] = add i64 [[OFFSET_IDX]], 15 -; VF16UF1-NEXT: [[TMP34:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP49]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP34]], align 2 -; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE32]] -; VF16UF1: [[PRED_STORE_CONTINUE32]]: -; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF16UF1-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF16UF1: 
[[MIDDLE_BLOCK]]: ; VF16UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; VF16UF1: [[SCALAR_PH]]: @@ -589,7 +527,7 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF16UF1-NEXT: store i16 0, ptr [[GEP_DST]], align 2 ; VF16UF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; VF16UF1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF16UF1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; VF16UF1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; VF16UF1: [[EXIT]]: ; VF16UF1-NEXT: ret void ; @@ -633,7 +571,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias ; VF8UF1-NEXT: store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1 ; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; VF8UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VF8UF1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF8UF1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VF8UF1: [[MIDDLE_BLOCK]]: ; VF8UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; VF8UF1-NEXT: br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]] @@ -648,7 +586,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias ; VF8UF1-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1 ; VF8UF1-NEXT: [[IV_NEXT]] = add i64 [[INNER_IV]], 1 ; VF8UF1-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF8UF1-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]] +; VF8UF1-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP6:![0-9]+]] ; VF8UF1: [[OUTER_LATCH]]: ; VF8UF1-NEXT: [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1 ; VF8UF1-NEXT: [[C_2:%.*]] = call i1 @cond() @@ -669,20 +607,17 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 
17) %N, ptr noalias ; VF8UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] ; VF8UF2: [[VECTOR_BODY]]: -; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; VF8UF2-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP6]] +; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 0 ; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 ; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 ; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 ; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 -; VF8UF2-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] +; VF8UF2-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 0 ; VF8UF2-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0 ; VF8UF2-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i32 8 ; VF8UF2-NEXT: store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1 ; VF8UF2-NEXT: store <8 x i8> [[WIDE_LOAD1]], ptr [[TMP5]], align 1 -; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF8UF2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; VF8UF2-NEXT: br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]] @@ -697,7 +632,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias ; VF8UF2-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1 ; VF8UF2-NEXT: [[IV_NEXT]] = add i64 [[INNER_IV]], 1 ; VF8UF2-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF8UF2-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]] +; VF8UF2-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop 
[[LOOP4:![0-9]+]] ; VF8UF2: [[OUTER_LATCH]]: ; VF8UF2-NEXT: [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1 ; VF8UF2-NEXT: [[C_2:%.*]] = call i1 @cond() @@ -718,16 +653,13 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias ; VF16UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] ; VF16UF1: [[VECTOR_BODY]]: -; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; VF16UF1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; VF16UF1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP4]] +; VF16UF1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 0 ; VF16UF1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 ; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]] +; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 0 ; VF16UF1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i32 0 ; VF16UF1-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 1 -; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF16UF1-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF16UF1: [[MIDDLE_BLOCK]]: ; VF16UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; VF16UF1-NEXT: br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]] @@ -742,7 +674,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias ; VF16UF1-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1 ; VF16UF1-NEXT: [[IV_NEXT]] = add i64 [[INNER_IV]], 1 ; VF16UF1-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF16UF1-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]] +; VF16UF1-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop 
[[LOOP4:![0-9]+]] ; VF16UF1: [[OUTER_LATCH]]: ; VF16UF1-NEXT: [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1 ; VF16UF1-NEXT: [[C_2:%.*]] = call i1 @cond() @@ -780,28 +712,19 @@ exit: ; VF8UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; VF8UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ; VF8UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; VF8UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; VF8UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; VF8UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; VF8UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; VF8UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} +; VF8UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; VF8UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} ;. ; VF8UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; VF8UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; VF8UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; VF8UF2: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"} +; VF8UF2: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} +; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} ; VF8UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; VF8UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; VF8UF2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; VF8UF2: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} ;. 
; VF16UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; VF16UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; VF16UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; VF16UF1: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"} +; VF16UF1: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} +; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} ; VF16UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; VF16UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; VF16UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; VF16UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} ;. -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll index 930d3cd41d31d..791c995d88c14 100644 --- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll +++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll @@ -499,16 +499,13 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress { ; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[G_64]] -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 0, [[G_64]] -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 0, [[G_64]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 0, [[TMP8]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = 
getelementptr inbounds i16, ptr [[TMP6]], i32 -3 ; CHECK-NEXT: store <4 x i16> splat (i16 -1), ptr [[TMP7]], align 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -521,7 +518,7 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress { ; CHECK-NEXT: store i16 [[G_16]], ptr [[GEP]], align 2 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[G_64]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 16 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -558,6 +555,5 @@ exit: ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]} ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} ; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]} -; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} -; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll index d70c874499cb7..beb305f23884e 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll @@ -61,7 +61,6 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; ; CHECK: Executing best plan with VF=8, UF=2 ; CHECK-NEXT: VPlan 'Final VPlan for VF={8},UF={2}' { -; CHECK-NEXT: Live-in ir<[[VFxUF:.+]]> = VF * UF ; CHECK-NEXT: Live-in ir<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count ; CHECK-EMPTY: @@ -75,27 +74,21 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK-NEXT: IR %n.vec = sub i64 %and, %n.mod.vf ; CHECK-NEXT: vp<[[END1:%.+]]> = DERIVED-IV ir<%and> + ir<[[VTC]]> * ir<-1> ; CHECK-NEXT: vp<[[END2:%.+]]> = DERIVED-IV ir<%A> + ir<[[VTC]]> * ir<1> -; CHECK-NEXT: Successor(s): vector loop +; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: -; CHECK-NEXT: vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: SCALAR-PHI vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:%.+]]> -; CHECK-NEXT: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]> -; CHECK-NEXT: vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]> -; CHECK-NEXT: vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>, ir<1> -; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VPTR1]]> -; CHECK-NEXT: WIDEN ir<%l>.1 = load vp<[[VPTR2]]> -; CHECK-NEXT: WIDEN ir<%add> = add nsw ir<%l>, ir<10> -; CHECK-NEXT: WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10> -; CHECK-NEXT: vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]> -; CHECK-NEXT: vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1> -; CHECK-NEXT: WIDEN store vp<[[VPTR3]]>, ir<%add> -; CHECK-NEXT: WIDEN store vp<[[VPTR4]]>, ir<%add>.1 -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, 
ir<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-cond ir -; CHECK-NEXT: No successors -; CHECK-NEXT: } +; CHECK-NEXT: vector.body: +; CHECK-NEXT: vp<[[STEPS1:%.+]]> = SCALAR-STEPS ir<0>, ir<1> +; CHECK-NEXT: EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]> +; CHECK-NEXT: vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]> +; CHECK-NEXT: vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>, ir<1> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VPTR1]]> +; CHECK-NEXT: WIDEN ir<%l>.1 = load vp<[[VPTR2]]> +; CHECK-NEXT: WIDEN ir<%add> = add nsw ir<%l>, ir<10> +; CHECK-NEXT: WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10> +; CHECK-NEXT: vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]> +; CHECK-NEXT: vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1> +; CHECK-NEXT: WIDEN store vp<[[VPTR3]]>, ir<%add> +; CHECK-NEXT: WIDEN store vp<[[VPTR4]]>, ir<%add>.1 ; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: From b5f21671ef04984bc00770263234dfb94833a274 Mon Sep 17 00:00:00 2001 From: William Moses Date: Sun, 5 Jan 2025 11:02:49 -0500 Subject: [PATCH 198/480] MLIR: Enable importing inlineasm calls (#121624) --- .../include/mlir/Target/LLVMIR/ModuleImport.h | 6 +- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 109 ++++++++++-------- .../Target/LLVMIR/Import/import-failure.ll | 9 -- .../test/Target/LLVMIR/Import/instructions.ll | 11 ++ 4 files changed, 79 insertions(+), 56 deletions(-) diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h index eea0647895b01..33c9af7c6335a 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h @@ -319,9 +319,13 @@ class ModuleImport { /// Appends the converted result type and operands of `callInst` to the /// `types` and `operands` arrays. For indirect calls, the method additionally /// inserts the called function at the beginning of the `operands` array. 
+ /// If `allowInlineAsm` is set to false (the default), it will return failure + /// if the called operand is an inline asm which isn't convertible to MLIR as + /// a value. LogicalResult convertCallTypeAndOperands(llvm::CallBase *callInst, SmallVectorImpl &types, - SmallVectorImpl &operands); + SmallVectorImpl &operands, + bool allowInlineAsm = false); /// Converts the parameter attributes attached to `func` and adds them to the /// `funcOp`. void convertParameterAttributes(llvm::Function *func, LLVMFuncOp funcOp, diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index b0d5e635248d3..95fb673fc72e3 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1473,18 +1473,20 @@ ModuleImport::convertBranchArgs(llvm::Instruction *branch, return success(); } -LogicalResult -ModuleImport::convertCallTypeAndOperands(llvm::CallBase *callInst, - SmallVectorImpl &types, - SmallVectorImpl &operands) { +LogicalResult ModuleImport::convertCallTypeAndOperands( + llvm::CallBase *callInst, SmallVectorImpl &types, + SmallVectorImpl &operands, bool allowInlineAsm) { if (!callInst->getType()->isVoidTy()) types.push_back(convertType(callInst->getType())); if (!callInst->getCalledFunction()) { - FailureOr called = convertValue(callInst->getCalledOperand()); - if (failed(called)) - return failure(); - operands.push_back(*called); + if (!allowInlineAsm || + !isa(callInst->getCalledOperand())) { + FailureOr called = convertValue(callInst->getCalledOperand()); + if (failed(called)) + return failure(); + operands.push_back(*called); + } } SmallVector args(callInst->args()); FailureOr> arguments = convertValues(args); @@ -1579,7 +1581,8 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) { SmallVector types; SmallVector operands; - if (failed(convertCallTypeAndOperands(callInst, types, operands))) + if (failed(convertCallTypeAndOperands(callInst, types, operands, + 
/*allowInlineAsm=*/true))) return failure(); auto funcTy = @@ -1587,45 +1590,59 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) { if (!funcTy) return failure(); - CallOp callOp; - - if (llvm::Function *callee = callInst->getCalledFunction()) { - callOp = builder.create( - loc, funcTy, SymbolRefAttr::get(context, callee->getName()), - operands); + if (auto asmI = dyn_cast(callInst->getCalledOperand())) { + auto callOp = builder.create( + loc, funcTy.getReturnType(), operands, + builder.getStringAttr(asmI->getAsmString()), + builder.getStringAttr(asmI->getConstraintString()), + /*has_side_effects=*/true, + /*is_align_stack=*/false, /*asm_dialect=*/nullptr, + /*operand_attrs=*/nullptr); + if (!callInst->getType()->isVoidTy()) + mapValue(inst, callOp.getResult(0)); + else + mapNoResultOp(inst, callOp); } else { - callOp = builder.create(loc, funcTy, operands); + CallOp callOp; + + if (llvm::Function *callee = callInst->getCalledFunction()) { + callOp = builder.create( + loc, funcTy, SymbolRefAttr::get(context, callee->getName()), + operands); + } else { + callOp = builder.create(loc, funcTy, operands); + } + callOp.setCConv(convertCConvFromLLVM(callInst->getCallingConv())); + callOp.setTailCallKind( + convertTailCallKindFromLLVM(callInst->getTailCallKind())); + setFastmathFlagsAttr(inst, callOp); + + // Handle function attributes. 
+ if (callInst->hasFnAttr(llvm::Attribute::Convergent)) + callOp.setConvergent(true); + if (callInst->hasFnAttr(llvm::Attribute::NoUnwind)) + callOp.setNoUnwind(true); + if (callInst->hasFnAttr(llvm::Attribute::WillReturn)) + callOp.setWillReturn(true); + + llvm::MemoryEffects memEffects = callInst->getMemoryEffects(); + ModRefInfo othermem = convertModRefInfoFromLLVM( + memEffects.getModRef(llvm::MemoryEffects::Location::Other)); + ModRefInfo argMem = convertModRefInfoFromLLVM( + memEffects.getModRef(llvm::MemoryEffects::Location::ArgMem)); + ModRefInfo inaccessibleMem = convertModRefInfoFromLLVM( + memEffects.getModRef(llvm::MemoryEffects::Location::InaccessibleMem)); + auto memAttr = MemoryEffectsAttr::get(callOp.getContext(), othermem, + argMem, inaccessibleMem); + // Only set the attribute when it does not match the default value. + if (!memAttr.isReadWrite()) + callOp.setMemoryEffectsAttr(memAttr); + + if (!callInst->getType()->isVoidTy()) + mapValue(inst, callOp.getResult()); + else + mapNoResultOp(inst, callOp); } - callOp.setCConv(convertCConvFromLLVM(callInst->getCallingConv())); - callOp.setTailCallKind( - convertTailCallKindFromLLVM(callInst->getTailCallKind())); - setFastmathFlagsAttr(inst, callOp); - - // Handle function attributes. 
- if (callInst->hasFnAttr(llvm::Attribute::Convergent)) - callOp.setConvergent(true); - if (callInst->hasFnAttr(llvm::Attribute::NoUnwind)) - callOp.setNoUnwind(true); - if (callInst->hasFnAttr(llvm::Attribute::WillReturn)) - callOp.setWillReturn(true); - - llvm::MemoryEffects memEffects = callInst->getMemoryEffects(); - ModRefInfo othermem = convertModRefInfoFromLLVM( - memEffects.getModRef(llvm::MemoryEffects::Location::Other)); - ModRefInfo argMem = convertModRefInfoFromLLVM( - memEffects.getModRef(llvm::MemoryEffects::Location::ArgMem)); - ModRefInfo inaccessibleMem = convertModRefInfoFromLLVM( - memEffects.getModRef(llvm::MemoryEffects::Location::InaccessibleMem)); - auto memAttr = MemoryEffectsAttr::get(callOp.getContext(), othermem, argMem, - inaccessibleMem); - // Only set the attribute when it does not match the default value. - if (!memAttr.isReadWrite()) - callOp.setMemoryEffectsAttr(memAttr); - - if (!callInst->getType()->isVoidTy()) - mapValue(inst, callOp.getResult()); - else - mapNoResultOp(inst, callOp); return success(); } if (inst->getOpcode() == llvm::Instruction::LandingPad) { diff --git a/mlir/test/Target/LLVMIR/Import/import-failure.ll b/mlir/test/Target/LLVMIR/Import/import-failure.ll index 6bde174642d54..b616cb81e0a8a 100644 --- a/mlir/test/Target/LLVMIR/Import/import-failure.ll +++ b/mlir/test/Target/LLVMIR/Import/import-failure.ll @@ -12,15 +12,6 @@ bb2: ; // ----- -; CHECK: -; CHECK-SAME: error: unhandled value: ptr asm "bswap $0", "=r,r" -define i32 @unhandled_value(i32 %arg1) { - %1 = call i32 asm "bswap $0", "=r,r"(i32 %arg1) - ret i32 %1 -} - -; // ----- - ; CHECK: ; CHECK-SAME: unhandled constant: ptr blockaddress(@unhandled_constant, %bb1) since blockaddress(...) 
is unsupported ; CHECK: diff --git a/mlir/test/Target/LLVMIR/Import/instructions.ll b/mlir/test/Target/LLVMIR/Import/instructions.ll index fff48bbc486bc..7377e2584110b 100644 --- a/mlir/test/Target/LLVMIR/Import/instructions.ll +++ b/mlir/test/Target/LLVMIR/Import/instructions.ll @@ -535,6 +535,17 @@ define void @indirect_vararg_call(ptr addrspace(42) %fn) { ; // ----- +; CHECK-LABEL: @inlineasm +; CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] +define i32 @inlineasm(i32 %arg1) { + ; CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects "bswap $0", "=r,r" %[[ARG1]] : (i32) -> i32 + %1 = call i32 asm "bswap $0", "=r,r"(i32 %arg1) + ; CHECK: return %[[RES]] + ret i32 %1 +} + +; // ----- + ; CHECK-LABEL: @gep_static_idx ; CHECK-SAME: %[[PTR:[a-zA-Z0-9]+]] define void @gep_static_idx(ptr %ptr) { From a37dbc1f51c70d92fd209c2e52a9d794eb15e4e7 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Mon, 6 Jan 2025 00:04:28 +0800 Subject: [PATCH 199/480] [InstCombine] Drop noundef in `foldSelectCttzCtlz` (#121692) Close https://github.com/llvm/llvm-project/issues/121428 --- .../Transforms/InstCombine/InstCombineSelect.cpp | 6 +++++- .../Transforms/InstCombine/select-cmp-cttz-ctlz.ll | 13 +++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index e7a8e947705f8..a18b927678efd 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1225,8 +1225,12 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal, // zext/trunc) have one use (ending at the select), the cttz/ctlz result will // not be used if the input is zero. Relax to 'zero is poison' for that case. 
if (II->hasOneUse() && SelectArg->hasOneUse() && - !match(II->getArgOperand(1), m_One())) + !match(II->getArgOperand(1), m_One())) { II->setArgOperand(1, ConstantInt::getTrue(II->getContext())); + // noundef attribute on the intrinsic may no longer be valid. + II->dropUBImplyingAttrsAndMetadata(); + IC.addToWorklist(II); + } return nullptr; } diff --git a/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll b/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll index 35b4087d767a7..2cb70e85f435f 100644 --- a/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll +++ b/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll @@ -495,6 +495,19 @@ define i32 @test_cttz_not_bw(i32 %x) { ret i32 %res } +define i32 @test_cttz_not_bw_noundef(i32 %x) { +; CHECK-LABEL: @test_cttz_not_bw_noundef( +; CHECK-NEXT: [[CT:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true) +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[X]], 0 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP_NOT]], i32 123, i32 [[CT]] +; CHECK-NEXT: ret i32 [[RES]] +; + %ct = tail call noundef i32 @llvm.cttz.i32(i32 %x, i1 false) + %cmp = icmp ne i32 %x, 0 + %res = select i1 %cmp, i32 %ct, i32 123 + ret i32 %res +} + define i32 @test_cttz_not_bw_multiuse(i32 %x) { ; CHECK-LABEL: @test_cttz_not_bw_multiuse( ; CHECK-NEXT: [[CT:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false) From b48e5f0ff3f25e8bdd3ae473dca00511336cbd6f Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Sun, 5 Jan 2025 11:19:08 -0500 Subject: [PATCH 200/480] [RISCV][VLOPT] Add Vector FP instructions to getOperandInfo (#121609) Although we cannot reduce the VL of these instructions (i.e. add to isSupported) we can add them to getOperandInfo to enable optimization where the FP vector instruction are users. 
Most of the instructions are covered by existing tests, and I added tests for the narrowing conversions because I was a little unsure whether the dest or the source was 2*SEW and 2*LMUL. --- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 103 +++++++++++++++++- llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll | 42 +++---- llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll | 36 ++---- llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll | 45 +++----- llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll | 45 +++----- .../test/CodeGen/RISCV/rvv/vl-opt-op-info.mir | 30 +++++ llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll | 3 +- llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll | 3 +- 8 files changed, 187 insertions(+), 120 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 85ea5a23e8f29..0ddfcd8620336 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -456,6 +456,49 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VCOMPRESS_VM: // Vector Element Index Instruction case RISCV::VID_V: + // Vector Single-Width Floating-Point Add/Subtract Instructions + case RISCV::VFADD_VF: + case RISCV::VFADD_VV: + case RISCV::VFSUB_VF: + case RISCV::VFSUB_VV: + case RISCV::VFRSUB_VF: + // Vector Single-Width Floating-Point Multiply/Divide Instructions + case RISCV::VFMUL_VF: + case RISCV::VFMUL_VV: + case RISCV::VFDIV_VF: + case RISCV::VFDIV_VV: + case RISCV::VFRDIV_VF: + // Vector Floating-Point Square-Root Instruction + case RISCV::VFSQRT_V: + // Vector Floating-Point Reciprocal Square-Root Estimate Instruction + case RISCV::VFRSQRT7_V: + // Vector Floating-Point Reciprocal Estimate Instruction + case RISCV::VFREC7_V: + // Vector Floating-Point MIN/MAX Instructions + case RISCV::VFMIN_VF: + case RISCV::VFMIN_VV: + case RISCV::VFMAX_VF: + case RISCV::VFMAX_VV: + // Vector Floating-Point Sign-Injection Instructions + case RISCV::VFSGNJ_VF: + case RISCV::VFSGNJ_VV: + case RISCV::VFSGNJN_VV: 
+ case RISCV::VFSGNJN_VF: + case RISCV::VFSGNJX_VF: + case RISCV::VFSGNJX_VV: + // Vector Floating-Point Classify Instruction + case RISCV::VFCLASS_V: + // Vector Floating-Point Move Instruction + case RISCV::VFMV_V_F: + // Single-Width Floating-Point/Integer Type-Convert Instructions + case RISCV::VFCVT_XU_F_V: + case RISCV::VFCVT_X_F_V: + case RISCV::VFCVT_RTZ_XU_F_V: + case RISCV::VFCVT_RTZ_X_F_V: + case RISCV::VFCVT_F_XU_V: + case RISCV::VFCVT_F_X_V: + // Vector Floating-Point Merge Instruction + case RISCV::VFMERGE_VFM: return OperandInfo(MIVLMul, MILog2SEW); // Vector Widening Integer Add/Subtract @@ -488,7 +531,33 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VWMACC_VX: case RISCV::VWMACCSU_VV: case RISCV::VWMACCSU_VX: - case RISCV::VWMACCUS_VX: { + case RISCV::VWMACCUS_VX: + // Vector Widening Floating-Point Fused Multiply-Add Instructions + case RISCV::VFWMACC_VF: + case RISCV::VFWMACC_VV: + case RISCV::VFWNMACC_VF: + case RISCV::VFWNMACC_VV: + case RISCV::VFWMSAC_VF: + case RISCV::VFWMSAC_VV: + case RISCV::VFWNMSAC_VF: + case RISCV::VFWNMSAC_VV: + // Vector Widening Floating-Point Add/Subtract Instructions + // Dest EEW=2*SEW and EMUL=2*LMUL. Source EEW=SEW and EMUL=LMUL. + case RISCV::VFWADD_VV: + case RISCV::VFWADD_VF: + case RISCV::VFWSUB_VV: + case RISCV::VFWSUB_VF: + // Vector Widening Floating-Point Multiply + case RISCV::VFWMUL_VF: + case RISCV::VFWMUL_VV: + // Widening Floating-Point/Integer Type-Convert Instructions + case RISCV::VFWCVT_XU_F_V: + case RISCV::VFWCVT_X_F_V: + case RISCV::VFWCVT_RTZ_XU_F_V: + case RISCV::VFWCVT_RTZ_X_F_V: + case RISCV::VFWCVT_F_XU_V: + case RISCV::VFWCVT_F_X_V: + case RISCV::VFWCVT_F_F_V: { unsigned Log2EEW = IsMODef ? MILog2SEW + 1 : MILog2SEW; RISCVII::VLMUL EMUL = IsMODef ? 
RISCVVType::twoTimesVLMUL(MIVLMul) : MIVLMul; @@ -503,7 +572,12 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VWADD_WV: case RISCV::VWADD_WX: case RISCV::VWSUB_WV: - case RISCV::VWSUB_WX: { + case RISCV::VWSUB_WX: + // Vector Widening Floating-Point Add/Subtract Instructions + case RISCV::VFWADD_WF: + case RISCV::VFWADD_WV: + case RISCV::VFWSUB_WF: + case RISCV::VFWSUB_WV: { bool IsOp1 = HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 1; bool TwoTimes = IsMODef || IsOp1; unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW; @@ -539,7 +613,16 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VNCLIPU_WX: case RISCV::VNCLIP_WI: case RISCV::VNCLIP_WV: - case RISCV::VNCLIP_WX: { + case RISCV::VNCLIP_WX: + // Narrowing Floating-Point/Integer Type-Convert Instructions + case RISCV::VFNCVT_XU_F_W: + case RISCV::VFNCVT_X_F_W: + case RISCV::VFNCVT_RTZ_XU_F_W: + case RISCV::VFNCVT_RTZ_X_F_W: + case RISCV::VFNCVT_F_XU_W: + case RISCV::VFNCVT_F_X_W: + case RISCV::VFNCVT_F_F_W: + case RISCV::VFNCVT_ROD_F_F_W: { bool IsOp1 = HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 1; bool TwoTimes = IsOp1; unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW; @@ -615,7 +698,19 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VMADC_VI: case RISCV::VMADC_VX: case RISCV::VMSBC_VV: - case RISCV::VMSBC_VX: { + case RISCV::VMSBC_VX: + // 13.13. Vector Floating-Point Compare Instructions + // Dest EEW=1 and EMUL=(EEW/SEW)*LMUL. Source EEW=SEW EMUL=LMUL. 
+ case RISCV::VMFEQ_VF: + case RISCV::VMFEQ_VV: + case RISCV::VMFNE_VF: + case RISCV::VMFNE_VV: + case RISCV::VMFLT_VF: + case RISCV::VMFLT_VV: + case RISCV::VMFLE_VF: + case RISCV::VMFLE_VV: + case RISCV::VMFGT_VF: + case RISCV::VMFGE_VF: { if (IsMODef) return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(0, MI), 0); return OperandInfo(MIVLMul, MILog2SEW); diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll index 6cd3884f029fd..a1d548e1878b4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll @@ -143,9 +143,8 @@ define @vfmacc_vf_nxv1f32( %va, half %b, ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -170,9 +169,8 @@ define @vfmacc_vf_nxv1f32_commute( %va, ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v11, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v11, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -198,9 +196,8 @@ define @vfmacc_vf_nxv1f32_unmasked( %va, ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -225,9 +222,8 
@@ define @vfmacc_vf_nxv1f32_tu( %va, half ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32_tu: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, tu, mu @@ -254,9 +250,8 @@ define @vfmacc_vf_nxv1f32_commute_tu( %v ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32_commute_tu: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, tu, mu @@ -283,9 +278,8 @@ define @vfmacc_vf_nxv1f32_unmasked_tu( % ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32_unmasked_tu: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, tu, ma @@ -362,9 +356,8 @@ define @vfmacc_vf_nxv2f32( %va, half %b, ; ZVFHMIN-LABEL: vfmacc_vf_nxv2f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -389,9 +382,8 @@ define @vfmacc_vf_nxv2f32_unmasked( %va, ; ZVFHMIN-LABEL: vfmacc_vf_nxv2f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, 
mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -468,9 +460,8 @@ define @vfmacc_vf_nxv4f32( %va, half %b, ; ZVFHMIN-LABEL: vfmacc_vf_nxv4f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -495,9 +486,8 @@ define @vfmacc_vf_nxv4f32_unmasked( %va, ; ZVFHMIN-LABEL: vfmacc_vf_nxv4f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -574,9 +564,8 @@ define @vfmacc_vf_nxv8f32( %va, half %b, ; ZVFHMIN-LABEL: vfmacc_vf_nxv8f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -601,9 +590,8 @@ define @vfmacc_vf_nxv8f32_unmasked( %va, ; ZVFHMIN-LABEL: vfmacc_vf_nxv8f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 ; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v8, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -696,9 +684,8 @@ define @vfmacc_vf_nxv16f32( %va, half ; ZVFHMIN-LABEL: vfmacc_vf_nxv16f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v4, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v4, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma @@ -723,9 +710,8 @@ define @vfmacc_vf_nxv16f32_unmasked( % ; ZVFHMIN-LABEL: vfmacc_vf_nxv16f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll index c92a79e49c164..94b80075ac14c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll @@ -120,9 +120,8 @@ define @vmfsac_vf_nxv1f32( %a, half %b, ; ZVFHMIN-LABEL: vmfsac_vf_nxv1f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -148,9 +147,8 @@ define @vmfsac_vf_nxv1f32_commute( %a, h ; ZVFHMIN-LABEL: vmfsac_vf_nxv1f32_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v11, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v11, a1 ; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -177,9 +175,8 @@ define @vmfsac_vf_nxv1f32_unmasked( %a, ; ZVFHMIN-LABEL: vmfsac_vf_nxv1f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -255,9 +252,8 @@ define @vmfsac_vf_nxv2f32( %a, half %b, ; ZVFHMIN-LABEL: vmfsac_vf_nxv2f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -283,9 +279,8 @@ define @vmfsac_vf_nxv2f32_commute( %a, h ; ZVFHMIN-LABEL: vmfsac_vf_nxv2f32_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v11, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v11, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -312,9 +307,8 @@ define @vmfsac_vf_nxv2f32_unmasked( %a, ; ZVFHMIN-LABEL: vmfsac_vf_nxv2f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -392,9 +386,8 @@ define 
@vmfsac_vf_nxv4f32( %a, half %b, ; ZVFHMIN-LABEL: vmfsac_vf_nxv4f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -420,9 +413,8 @@ define @vmfsac_vf_nxv4f32_commute( %a, h ; ZVFHMIN-LABEL: vmfsac_vf_nxv4f32_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -449,9 +441,8 @@ define @vmfsac_vf_nxv4f32_unmasked( %a, ; ZVFHMIN-LABEL: vmfsac_vf_nxv4f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -529,9 +520,8 @@ define @vmfsac_vf_nxv8f32( %a, half %b, ; ZVFHMIN-LABEL: vmfsac_vf_nxv8f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -557,9 +547,8 @@ define @vmfsac_vf_nxv8f32_commute( %a, h ; ZVFHMIN-LABEL: vmfsac_vf_nxv8f32_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; 
ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -586,9 +575,8 @@ define @vmfsac_vf_nxv8f32_unmasked( %a, ; ZVFHMIN-LABEL: vmfsac_vf_nxv8f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll index 0a0bc6696a9f9..ea457069bcdf1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll @@ -71,9 +71,8 @@ define @vfnmacc_vf_nxv1f32( %a, half %b, ; ZVFHMIN-LABEL: vfnmacc_vf_nxv1f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -101,9 +100,8 @@ define @vfnmacc_vf_nxv1f32_commute( %a, ; ZVFHMIN-LABEL: vfnmacc_vf_nxv1f32_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v11, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v11, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -131,9 +129,8 @@ define @vfnmacc_vf_nxv1f32_unmasked( %a, ; ZVFHMIN-LABEL: vfnmacc_vf_nxv1f32_unmasked: ; ZVFHMIN: # %bb.0: ; 
ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -212,9 +209,8 @@ define @vfnmacc_vf_nxv2f32( %a, half %b, ; ZVFHMIN-LABEL: vfnmacc_vf_nxv2f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -242,9 +238,8 @@ define @vfnmacc_vf_nxv2f32_commute( %a, ; ZVFHMIN-LABEL: vfnmacc_vf_nxv2f32_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v11, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v11, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -272,9 +267,8 @@ define @vfnmacc_vf_nxv2f32_unmasked( %a, ; ZVFHMIN-LABEL: vfnmacc_vf_nxv2f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -355,9 +349,8 @@ define @vfnmacc_vf_nxv4f32( %a, half %b, ; ZVFHMIN-LABEL: vfnmacc_vf_nxv4f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; 
ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -385,9 +378,8 @@ define @vfnmacc_vf_nxv4f32_commute( %a, ; ZVFHMIN-LABEL: vfnmacc_vf_nxv4f32_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -415,9 +407,8 @@ define @vfnmacc_vf_nxv4f32_unmasked( %a, ; ZVFHMIN-LABEL: vfnmacc_vf_nxv4f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -498,9 +489,8 @@ define @vfnmacc_vf_nxv8f32( %a, half %b, ; ZVFHMIN-LABEL: vfnmacc_vf_nxv8f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -528,9 +518,8 @@ define @vfnmacc_vf_nxv8f32_commute( %a, ; ZVFHMIN-LABEL: vfnmacc_vf_nxv8f32_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma 
@@ -558,9 +547,8 @@ define @vfnmacc_vf_nxv8f32_unmasked( %a, ; ZVFHMIN-LABEL: vfnmacc_vf_nxv8f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -657,9 +645,8 @@ define @vfnmacc_vf_nxv16f32( %a, half ; ZVFHMIN-LABEL: vfnmacc_vf_nxv16f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v4, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v4, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma @@ -687,9 +674,8 @@ define @vfnmacc_vf_nxv16f32_commute( % ; ZVFHMIN-LABEL: vfnmacc_vf_nxv16f32_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v4, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v4, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma @@ -717,9 +703,8 @@ define @vfnmacc_vf_nxv16f32_unmasked( ; ZVFHMIN-LABEL: vfnmacc_vf_nxv16f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll index b5f7ef3380869..4956da531c8ca 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll @@ -69,9 +69,8 @@ define @vfnmsac_vf_nxv1f32( %a, half %b, ; ZVFHMIN-LABEL: vfnmsac_vf_nxv1f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -97,9 +96,8 @@ define @vfnmsac_vf_nxv1f32_commute( %a, ; ZVFHMIN-LABEL: vfnmsac_vf_nxv1f32_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v11, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v11, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -126,9 +124,8 @@ define @vfnmsac_vf_nxv1f32_unmasked( %a, ; ZVFHMIN-LABEL: vfnmsac_vf_nxv1f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -204,9 +201,8 @@ define @vfnmsac_vf_nxv2f32( %a, half %b, ; ZVFHMIN-LABEL: vfnmsac_vf_nxv2f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -232,9 +228,8 @@ define @vfnmsac_vf_nxv2f32_commute( %a, ; 
ZVFHMIN-LABEL: vfnmsac_vf_nxv2f32_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v11, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v11, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -261,9 +256,8 @@ define @vfnmsac_vf_nxv2f32_unmasked( %a, ; ZVFHMIN-LABEL: vfnmsac_vf_nxv2f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -341,9 +335,8 @@ define @vfnmsac_vf_nxv4f32( %a, half %b, ; ZVFHMIN-LABEL: vfnmsac_vf_nxv4f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -369,9 +362,8 @@ define @vfnmsac_vf_nxv4f32_commute( %a, ; ZVFHMIN-LABEL: vfnmsac_vf_nxv4f32_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -398,9 +390,8 @@ define @vfnmsac_vf_nxv4f32_unmasked( %a, ; ZVFHMIN-LABEL: vfnmsac_vf_nxv4f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: 
vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -478,9 +469,8 @@ define @vfnmsac_vf_nxv8f32( %a, half %b, ; ZVFHMIN-LABEL: vfnmsac_vf_nxv8f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -506,9 +496,8 @@ define @vfnmsac_vf_nxv8f32_commute( %a, ; ZVFHMIN-LABEL: vfnmsac_vf_nxv8f32_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -535,9 +524,8 @@ define @vfnmsac_vf_nxv8f32_unmasked( %a, ; ZVFHMIN-LABEL: vfnmsac_vf_nxv8f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -631,9 +619,8 @@ define @vfnmsac_vf_nxv16f32( %a, half ; ZVFHMIN-LABEL: vfnmsac_vf_nxv16f32: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v4, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v4, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4, v0.t ; 
ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma @@ -659,9 +646,8 @@ define @vfnmsac_vf_nxv16f32_commute( % ; ZVFHMIN-LABEL: vfnmsac_vf_nxv16f32_commute: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v4, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v4, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma @@ -688,9 +674,8 @@ define @vfnmsac_vf_nxv16f32_unmasked( ; ZVFHMIN-LABEL: vfnmsac_vf_nxv16f32_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir index b304769b27731..f9b81863d68d6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir @@ -513,6 +513,36 @@ body: | %y:vr = PseudoVNSRL_WV_MF2 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 ... --- +name: vfnop_vs2 +body: | + bb.0: + ; CHECK-LABEL: name: vfnop_vs2 + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 3 /* e8 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 3 /* e8 */, 0 +... 
+--- +name: vfnop_vs2_incompatible_eew +body: | + bb.0: + ; CHECK-LABEL: name: vfnop_vs2_incompatible_eew + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 4 /* e16 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 + early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 4 /* e16 */, 0 +... +--- +name: vfnop_vs2_incompatible_emul +body: | + bb.0: + ; CHECK-LABEL: name: vfnop_vs2_incompatible_emul + ; CHECK: %x:vr = PseudoVADD_VV_MF2 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 3 /* e8 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_MF2 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 3 /* e8 */, 0 +... +--- name: vseN_v body: | bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll index 001f744503523..c041a165a594f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll @@ -131,10 +131,9 @@ declare @llvm.vp.sitofp.nxv2f16.nxv2i7(, @vsitofp_nxv2f16_nxv2i7( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vsitofp_nxv2f16_nxv2i7: ; ZVFH: # %bb.0: -; ZVFH-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; ZVFH-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; ZVFH-NEXT: vadd.vv v8, v8, v8 ; ZVFH-NEXT: vsra.vi v9, v8, 1 -; ZVFH-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; ZVFH-NEXT: vfwcvt.f.x.v v8, v9, v0.t ; ZVFH-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll index 06d85193e3b61..3d27a1eaf22e1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll @@ -124,9 +124,8 @@ define @vuitofp_nxv2f16_nxv2i7( %va, Date: Sun, 5 Jan 2025 11:19:45 -0500 Subject: [PATCH 201/480] [RISCV][VLOPT] Add 
vmv.x.s and vfmv.f.s to isVectorOpUsedAsScalarOp (#121588) --- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 0ddfcd8620336..32d552625a8e8 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -935,6 +935,9 @@ static bool isVectorOpUsedAsScalarOp(MachineOperand &MO) { case RISCV::VFWREDOSUM_VS: case RISCV::VFWREDUSUM_VS: return MO.getOperandNo() == 3; + case RISCV::VMV_X_S: + case RISCV::VFMV_F_S: + return MO.getOperandNo() == 1; default: return false; } From 486f83faa31ae5356523da868a557619601a0e3e Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Sun, 5 Jan 2025 17:32:07 +0100 Subject: [PATCH 202/480] [mlir][Transforms][NFC] Simplify `buildUnresolvedMaterialization` implementation (#121651) The `buildUnresolvedMaterialization` implementation used to check if a materialization is necessary. A materialization is not necessary if the desired types already match the input. However, this situation can never happen: we look for mapped values with the desired type at the call sites before requesting a new unresolved materialization. The previous implementation seemed incorrect because `buildUnresolvedMaterialization` created a mapping that is never rolled back. (When in reality that code was never executed, so it is technically not incorrect.) Also fix a comment that in `findOrBuildReplacementValue` that was incorrect. 
--- mlir/lib/Transforms/Utils/DialectConversion.cpp | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 48b8c727a7828..8296c0c468b01 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -1430,13 +1430,8 @@ ValueRange ConversionPatternRewriterImpl::buildUnresolvedMaterialization( UnrealizedConversionCastOp *castOp) { assert((!originalType || kind == MaterializationKind::Target) && "original type is valid only for target materializations"); - - // Avoid materializing an unnecessary cast. - if (TypeRange(inputs) == outputTypes) { - if (!valuesToMap.empty()) - mapping.map(std::move(valuesToMap), inputs); - return inputs; - } + assert(TypeRange(inputs) != outputTypes && + "materialization is not necessary"); // Create an unresolved materialization. We use a new OpBuilder to avoid // tracking the materialization like we do for other operations. @@ -1455,7 +1450,9 @@ ValueRange ConversionPatternRewriterImpl::buildUnresolvedMaterialization( Value ConversionPatternRewriterImpl::findOrBuildReplacementValue( Value value, const TypeConverter *converter) { - // Find a replacement value with the same type. + // Try to find a replacement value with the same type in the conversion value + // mapping. This includes cached materializations. We try to reuse those + // instead of generating duplicate IR. ValueVector repl = mapping.lookupOrNull(value, value.getType()); if (!repl.empty()) return repl.front(); @@ -1489,10 +1486,6 @@ Value ConversionPatternRewriterImpl::findOrBuildReplacementValue( // in the conversion value mapping.) The insertion point of the // materialization must be valid for all future users that may be created // later in the conversion process. 
- // - // Note: Instead of creating new IR, `buildUnresolvedMaterialization` may - // return an already existing, cached materialization from the conversion - // value mapping. Value castValue = buildUnresolvedMaterialization(MaterializationKind::Source, computeInsertPoint(repl), value.getLoc(), From 2dcb3b9f377de428f7d9d103c80226b9007c72a9 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Sun, 5 Jan 2025 17:44:13 +0100 Subject: [PATCH 203/480] [mlir][ArmSME] Remove func patterns from vector lowering (#121640) Remove `func.call` and `func.return` patterns from `populateArmSVELegalizeForLLVMExportPatterns`. This function is called from `ConvertVectorToLLVMPass::runOnOperation`. That pass should lower only `vector` dialect ops, not `func` dialect ops. These patterns also seem to be unnecessary, as no test cases are failing without them. Also note that there is no `func.func` pattern, so any application of the above-mentioned patterns produces invalid IR. --- .../Transforms/LegalizeForLLVMExport.cpp | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp index 845a32c4d97b5..2bdb640699d03 100644 --- a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp +++ b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp @@ -20,22 +20,6 @@ using namespace mlir; using namespace mlir::arm_sve; -template -class ForwardOperands : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const final { - if (adaptor.getOperands().getTypes() == op->getOperands().getTypes()) - return rewriter.notifyMatchFailure(op, "operand types already match"); - - rewriter.modifyOpInPlace(op, - [&]() { op->setOperands(adaptor.getOperands()); }); - return success(); - } -}; - using SdotOpLowering = 
OneToOneConvertToLLVMPattern; using SmmlaOpLowering = OneToOneConvertToLLVMPattern; using UdotOpLowering = OneToOneConvertToLLVMPattern; @@ -204,10 +188,6 @@ void mlir::populateArmSVELegalizeForLLVMExportPatterns( // Populate conversion patterns // clang-format off - patterns.add, - ForwardOperands, - ForwardOperands>(converter, - &converter.getContext()); patterns.add Date: Sun, 5 Jan 2025 19:42:14 +0300 Subject: [PATCH 204/480] [clang][NFC] Add end namespace comments to C++ DR tests --- clang/test/CXX/drs/cwg0xx.cpp | 156 ++++++++++++++++----------------- clang/test/CXX/drs/cwg10xx.cpp | 12 +-- clang/test/CXX/drs/cwg11xx.cpp | 2 +- clang/test/CXX/drs/cwg12xx.cpp | 18 ++-- clang/test/CXX/drs/cwg13xx.cpp | 32 +++---- clang/test/CXX/drs/cwg14xx.cpp | 22 ++--- clang/test/CXX/drs/cwg15xx.cpp | 26 +++--- clang/test/CXX/drs/cwg16xx.cpp | 28 +++--- clang/test/CXX/drs/cwg177x.cpp | 4 +- clang/test/CXX/drs/cwg17xx.cpp | 18 ++-- clang/test/CXX/drs/cwg1807.cpp | 2 +- clang/test/CXX/drs/cwg18xx.cpp | 20 ++--- clang/test/CXX/drs/cwg19xx.cpp | 22 ++--- clang/test/CXX/drs/cwg1xx.cpp | 130 +++++++++++++-------------- clang/test/CXX/drs/cwg20xx.cpp | 16 ++-- clang/test/CXX/drs/cwg21xx.cpp | 22 ++--- clang/test/CXX/drs/cwg22xx.cpp | 14 +-- clang/test/CXX/drs/cwg23xx.cpp | 44 +++++----- clang/test/CXX/drs/cwg24xx.cpp | 8 +- clang/test/CXX/drs/cwg2504.cpp | 2 +- clang/test/CXX/drs/cwg25xx.cpp | 25 ++---- clang/test/CXX/drs/cwg26xx.cpp | 35 ++++---- clang/test/CXX/drs/cwg27xx.cpp | 6 +- clang/test/CXX/drs/cwg28xx.cpp | 2 +- clang/test/CXX/drs/cwg29xx.cpp | 12 +-- clang/test/CXX/drs/cwg2xx.cpp | 112 +++++++++++------------ clang/test/CXX/drs/cwg3xx.cpp | 125 +++++++++++++------------- clang/test/CXX/drs/cwg492.cpp | 2 +- clang/test/CXX/drs/cwg4xx.cpp | 140 ++++++++++++++--------------- clang/test/CXX/drs/cwg571.cpp | 2 +- clang/test/CXX/drs/cwg5xx.cpp | 118 ++++++++++++------------- clang/test/CXX/drs/cwg6xx.cpp | 154 ++++++++++++++++---------------- clang/test/CXX/drs/cwg722.cpp | 2 
+- clang/test/CXX/drs/cwg7xx.cpp | 8 +- clang/test/CXX/drs/cwg8xx.cpp | 2 +- clang/test/CXX/drs/cwg9xx.cpp | 8 +- 36 files changed, 669 insertions(+), 682 deletions(-) diff --git a/clang/test/CXX/drs/cwg0xx.cpp b/clang/test/CXX/drs/cwg0xx.cpp index 8f7bd6532ae6d..2e2e6d4e662d6 100644 --- a/clang/test/CXX/drs/cwg0xx.cpp +++ b/clang/test/CXX/drs/cwg0xx.cpp @@ -54,7 +54,7 @@ namespace cwg1 { // cwg1: no // FIXME: This should be rejected, due to the ambiguous default argument. i(); } -} +} // namespace cwg1 namespace cwg3 { // cwg3: yes template struct A {}; @@ -63,7 +63,7 @@ namespace cwg3 { // cwg3: yes template<> struct A {}; // expected-error@-1 {{explicit specialization of 'cwg3::A' after instantiation}} // expected-note@#cwg3-f-T {{implicit instantiation first required here}} -} +} // namespace cwg3 namespace cwg4 { // cwg4: 2.8 extern "C" { @@ -74,7 +74,7 @@ namespace cwg4 { // cwg4: 2.8 // expected-error@-1 {{conflicting types for 'cwg4_g'}} // expected-note@#cwg4-g-int {{previous definition is here}} } -} +} // namespace cwg4 namespace cwg5 { // cwg5: 3.1 struct A {} a; @@ -88,7 +88,7 @@ namespace cwg5 { // cwg5: 3.1 struct D : C {}; struct E { operator D&(); } e; const C c = e; -} +} // namespace cwg5 namespace cwg7 { // cwg7: 3.4 class A { public: ~A(); }; @@ -118,7 +118,7 @@ namespace cwg7 { // cwg7: 3.4 }; S5::S5() {} } -} +} // namespace cwg7 namespace cwg8 { // cwg8: dup 45 class A { @@ -130,7 +130,7 @@ namespace cwg8 { // cwg8: dup 45 T *g(); }; A::T *A::g() { return 0; } -} +} // namespace cwg8 namespace cwg9 { // cwg9: 2.8 struct B { @@ -146,7 +146,7 @@ namespace cwg9 { // cwg9: 2.8 // expected-note@#cwg9-N {{constrained by protected inheritance here}} // expected-note@#cwg9-m {{member is declared here}} int R2() { return n.m; } -} +} // namespace cwg9 namespace cwg10 { // cwg10: dup 45 class A { @@ -154,7 +154,7 @@ namespace cwg10 { // cwg10: dup 45 A::B *p; }; }; -} +} // namespace cwg10 namespace cwg11 { // cwg11: yes template struct A : T { @@ 
-168,7 +168,7 @@ namespace cwg11 { // cwg11: yes }; struct X { typedef int U; }; A ax; -} +} // namespace cwg11 namespace cwg12 { // cwg12: sup 239 enum E { e }; @@ -180,7 +180,7 @@ namespace cwg12 { // cwg12: sup 239 int &b = f(e); int &c = f(1); } -} +} // namespace cwg12 namespace cwg13 { // cwg13: no extern "C" void f(int); @@ -195,7 +195,7 @@ namespace cwg13 { // cwg13: no A a2(g); int a3 = h(f); // FIXME: We should reject this. int a4 = h(g); -} +} // namespace cwg13 namespace cwg14 { // cwg14: 3.4 namespace X { extern "C" int cwg14_f(); } @@ -219,14 +219,14 @@ namespace cwg14 { // cwg14: 3.4 // expected-error@-1 {{reference to 'U' is ambiguous}} // expected-note@#cwg14-X-U {{candidate found by name lookup is 'cwg14::X::U'}} // expected-note@#cwg14-Y-U {{candidate found by name lookup is 'cwg14::Y::U'}} -} +} // namespace cwg14 namespace cwg15 { // cwg15: yes template void f(int); // #cwg15-f-decl-first template void f(int = 0); // expected-error@-1 {{default arguments cannot be added to a function template that has already been declared}} // expected-note@#cwg15-f-decl-first {{previous template declaration is here}} -} +} // namespace cwg15 namespace cwg16 { // cwg16: 2.8 class A { // #cwg16-A @@ -248,7 +248,7 @@ namespace cwg16 { // cwg16: 2.8 // expected-note@#cwg16-B {{implicitly declared private here}} } }; -} +} // namespace cwg16 namespace cwg17 { // cwg17: yes class A { @@ -261,7 +261,7 @@ namespace cwg17 { // cwg17: yes struct A::C : A { int g() { return n; } }; -} +} // namespace cwg17 // cwg18: sup 577 @@ -279,7 +279,7 @@ namespace cwg19 { // cwg19: 3.1 // expected-note@#cwg19-n {{member is declared here}} int get2() { return ((A&)c).n; } // ok, A is an accessible base of B from here }; -} +} // namespace cwg19 namespace cwg20 { // cwg20: 2.8 class X { @@ -292,7 +292,7 @@ namespace cwg20 { // cwg20: 2.8 X x = f(); // expected-error@-1 {{calling a private constructor of class 'cwg20::X'}} // expected-note@#cwg20-X-ctor {{declared private here}} -} 
+} // namespace cwg20 namespace cwg21 { // cwg21: 3.4 template struct A; @@ -302,14 +302,14 @@ namespace cwg21 { // cwg21: 3.4 template friend struct B; // expected-error@-1 {{default template argument not permitted on a friend template}} }; -} +} // namespace cwg21 namespace cwg22 { // cwg22: sup 481 template struct X; // expected-error@-1 {{unknown type name 'cwg22_T'}} typedef int T; template struct Y; -} +} // namespace cwg22 namespace cwg23 { // cwg23: yes template void f(T, T); // #cwg23-f-T-T @@ -318,7 +318,7 @@ namespace cwg23 { // cwg23: yes // expected-error@-1 {{call to 'f' is ambiguous}} // expected-note@#cwg23-f-T-T {{candidate function [with T = int]}} // expected-note@#cwg23-f-T-int {{candidate function [with T = int]}} -} +} // namespace cwg23 // cwg24: na @@ -355,7 +355,7 @@ namespace cwg25 { // cwg25: yes // since-cxx17-error@-2 {{different exception specifications}} j = &A::f; } -} +} // namespace cwg25 namespace cwg26 { // cwg26: yes struct A { A(A, const A & = A()); }; @@ -375,12 +375,12 @@ namespace cwg26 { // cwg26: yes // expected-error@-1 {{recursive evaluation of default argument}} // expected-note@-2 {{default argument used here}} }; -} +} // namespace cwg26 namespace cwg27 { // cwg27: yes enum E { e } n; E &m = true ? 
n : n; -} +} // namespace cwg27 // cwg28: na lib @@ -441,7 +441,7 @@ namespace cwg29 { // cwg29: 3.4 // expected-error@-1 {{declaration of 'cwg29_f8' has a different language linkage}} // expected-note@#cwg29-f8 {{previous declaration is here}} } -} +} // namespace cwg29 namespace cwg30 { // cwg30: sup 468 c++11 struct A { @@ -454,7 +454,7 @@ namespace cwg30 { // cwg30: sup 468 c++11 // cxx98-error@-1 {{'template' keyword outside of a template}} int z = p->template f<0>(); // cxx98-error@-1 {{'template' keyword outside of a template}} -} +} // namespace cwg30 namespace cwg31 { // cwg31: 2.8 class X { @@ -466,7 +466,7 @@ namespace cwg31 { // cwg31: 2.8 X *p = new X; // expected-error@-1 {{'operator delete' is a private member of 'cwg31::X'}} // expected-note@#cwg31-delete {{declared private here}} -} +} // namespace cwg31 // cwg32: na @@ -511,7 +511,7 @@ namespace cwg33 { // cwg33: 9 int m = Q() + X().f; // ok int n = Q() + (&(X().f)); // ok } -} +} // namespace cwg33 // cwg34: na // cwg35: dup 178 @@ -619,7 +619,7 @@ namespace example4 { // expected-note@#cwg36-E-k-first {{previous using declaration}} }; } -} +} // namespace cwg36 // cwg37: sup 475 @@ -627,7 +627,7 @@ namespace cwg38 { // cwg38: yes template struct X {}; template X operator+(X a, X b) { return a; } template X operator+(X, X); -} +} // namespace cwg38 namespace cwg39 { // cwg39: no namespace example1 { @@ -716,18 +716,18 @@ namespace cwg39 { // cwg39: no // expected-note@#cwg39-A-n {{member found by ambiguous name lookup}} #endif } -} +} // namespace cwg39 // cwg40: na namespace cwg41 { // cwg41: yes struct S f(S); -} +} // namespace cwg41 namespace cwg42 { // cwg42: yes struct A { static const int k = 0; }; struct B : A { static const int k = A::k; }; -} +} // namespace cwg42 // cwg43: na @@ -736,7 +736,7 @@ namespace cwg44 { // cwg44: sup 727 template void f(); template<> void f<0>(); }; -} +} // namespace cwg44 namespace cwg45 { // cwg45: yes class A { @@ -744,13 +744,13 @@ namespace cwg45 { // 
cwg45: yes class C : B {}; C c; }; -} +} // namespace cwg45 namespace cwg46 { // cwg46: yes template struct A { template struct B {}; }; template template struct A::B; // expected-error@-1 {{expected unqualified-id}} -} +} // namespace cwg46 namespace cwg47 { // cwg47: sup 329 template struct A { @@ -764,7 +764,7 @@ namespace cwg47 { // cwg47: sup 329 void f(); void g() { f(); } -} +} // namespace cwg47 namespace cwg48 { // cwg48: yes namespace { @@ -780,7 +780,7 @@ namespace cwg48 { // cwg48: yes const int &b = S::n; const int S::o; const int &c = S::o; -} +} // namespace cwg48 namespace cwg49 { // cwg49: 2.8 template struct A {}; // #cwg49-A @@ -806,7 +806,7 @@ namespace cwg49 { // cwg49: 2.8 // since-cxx17-error@#cwg49-c {{non-type template argument is not a constant expression}} // since-cxx17-note@#cwg49-c {{read of non-constexpr variable 'q' is not allowed in a constant expression}} // since-cxx17-note@#cwg49-q {{declared here}} -} +} // namespace cwg49 namespace cwg50 { // cwg50: yes struct X; // #cwg50-X @@ -818,7 +818,7 @@ namespace cwg50 { // cwg50: yes X *u = dynamic_cast(p); // expected-error@-1 {{'cwg50::X' is an incomplete type}} // expected-note@#cwg50-X {{forward declaration of 'cwg50::X'}} -} +} // namespace cwg50 namespace cwg51 { // cwg51: 2.8 struct A {}; @@ -828,7 +828,7 @@ namespace cwg51 { // cwg51: 2.8 operator B&(); } s; A &a = s; -} +} // namespace cwg51 namespace cwg52 { // cwg52: 2.8 struct A { int n; }; // #cwg52-A @@ -840,12 +840,12 @@ namespace cwg52 { // cwg52: 2.8 // expected-note@#cwg52-A {{member is declared here}} // expected-error@#cwg52-k {{cannot cast 'struct B' to its private base class 'cwg52::A'}} // expected-note@#cwg52-B {{declared private here}} -} +} // namespace cwg52 namespace cwg53 { // cwg53: yes int n = 0; enum E { e } x = static_cast(n); -} +} // namespace cwg53 namespace cwg54 { // cwg54: 2.8 struct A { int a; } a; @@ -899,12 +899,12 @@ namespace cwg54 { // cwg54: 2.8 // expected-error@-1 {{cannot cast 'cwg54::V 
*' to 'B *' via virtual base 'cwg54::V'}} int B::*cmbv = (int B::*)(&V::v); // expected-error@-1 {{conversion from pointer to member of class 'cwg54::V' to pointer to member of class 'B' via virtual base 'cwg54::V' is not allowed}} -} +} // namespace cwg54 namespace cwg55 { // cwg55: yes enum E { e = 5 }; static_assert(e + 1 == 6, ""); -} +} // namespace cwg55 namespace cwg56 { // cwg56: yes struct A { @@ -920,7 +920,7 @@ namespace cwg56 { // cwg56: yes // expected-error@-1 {{redefinition of 'X'}} // expected-note@#cwg56-typedef-X-X-first {{previous definition is here}} }; -} +} // namespace cwg56 namespace cwg58 { // cwg58: 3.1 // FIXME: Ideally, we should have a CodeGen test for this. @@ -931,7 +931,7 @@ namespace cwg58 { // cwg58: 3.1 static_assert(X{E1_1, E2_m1}.e1 == 1, ""); static_assert(X{E1_1, E2_m1}.e2 == -1, ""); #endif -} +} // namespace cwg58 namespace cwg59 { // cwg59: yes #pragma clang diagnostic push @@ -988,14 +988,14 @@ namespace cwg59 { // cwg59: yes int n4 = convert_to(); int n5 = convert_to(); #pragma clang diagnostic pop -} +} // namespace cwg59 namespace cwg60 { // cwg60: yes void f(int &); int &f(...); const int k = 0; int &n = f(k); -} +} // namespace cwg60 namespace cwg61 { // cwg61: 3.4 struct X { @@ -1012,7 +1012,7 @@ namespace cwg61 { // cwg61: 3.4 // expected-error@-1 {{cannot create a non-constant pointer to member function}} void (*r)() = y.f; // expected-error@-1 {{cannot create a non-constant pointer to member function}} -} +} // namespace cwg61 namespace cwg62 { // cwg62: 2.9 struct A { @@ -1065,13 +1065,13 @@ namespace cwg62 { // cwg62: 2.9 X d; // cxx98-error@-1 {{template argument uses local type }} } -} +} // namespace cwg62 namespace cwg63 { // cwg63: yes template struct S { typename T::error e; }; extern S *p; void *q = p; -} +} // namespace cwg63 namespace cwg64 { // cwg64: yes template void f(T); @@ -1079,7 +1079,7 @@ namespace cwg64 { // cwg64: yes template<> void f(int*); template<> void f(int*); template<> void f(int); 
-} +} // namespace cwg64 // cwg65: na @@ -1100,7 +1100,7 @@ namespace cwg66 { // cwg66: no int c = f(1, 2); // expected-error@-1 {{no matching function for call to 'f'}} // expected-note@#cwg66-f-first {{candidate function not viable: requires single argument 'n', but 2 arguments were provided}} -} +} // namespace cwg66 // cwg67: na @@ -1121,7 +1121,7 @@ namespace cwg68 { // cwg68: 2.8 friend typename ::cwg68::X; // cxx98-error@-1 {{unelaborated friend declaration is a C++11 extension; specify 'struct' to befriend 'typename ::cwg68::X'}} }; -} +} // namespace cwg68 namespace cwg69 { // cwg69: 9 template static void f() {} // #cwg69-f @@ -1134,59 +1134,59 @@ namespace cwg69 { // cwg69: 9 Q<&f > q; // cxx98-error@-1 {{non-type template argument referring to function 'f' with internal linkage is a C++11 extension}} // cxx98-note@#cwg69-f {{non-type template argument refers to function here}} -} +} // namespace cwg69 namespace cwg70 { // cwg70: yes template struct A {}; template int f(int (&)[I + J], A, A); int arr[7]; int k = f(arr, A<3>(), A<4>()); -} +} // namespace cwg70 // cwg71: na // cwg72: dup 69 -#if __cplusplus >= 201103L namespace cwg73 { // cwg73: sup 1652 +#if __cplusplus >= 201103L int a, b; static_assert(&a + 1 != &b, ""); // expected-error@-1 {{static assertion expression is not an integral constant expression}} // expected-note@-2 {{comparison against pointer '&a + 1' that points past the end of a complete object has unspecified value}} -} #endif +} // namespace cwg73 namespace cwg74 { // cwg74: yes enum E { k = 5 }; int (*p)[k] = new int[k][k]; -} +} // namespace cwg74 namespace cwg75 { // cwg75: yes struct S { static int n = 0; // expected-error@-1 {{non-const static data member must be initialized out of line}} }; -} +} // namespace cwg75 namespace cwg76 { // cwg76: yes const volatile int n = 1; static_assert(n, ""); // expected-error@-1 {{static assertion expression is not an integral constant expression}} // expected-note@-2 {{read of 
volatile-qualified type 'const volatile int' is not allowed in a constant expression}} -} +} // namespace cwg76 namespace cwg77 { // cwg77: yes struct A { struct B {}; friend struct B; }; -} +} // namespace cwg77 namespace cwg78 { // cwg78: sup ???? // Under CWG78, this is valid, because 'k' has static storage duration, so is // zero-initialized. const int k; // expected-error@-1 {{default initialization of an object of const type 'const int'}} -} +} // namespace cwg78 // cwg79: na @@ -1208,7 +1208,7 @@ namespace cwg80 { // cwg80: 2.9 int D; // expected-error@-1 {{member 'D' has the same name as its class}} }; -} +} // namespace cwg80 // cwg81: na // cwg82: dup 48 @@ -1217,7 +1217,7 @@ namespace cwg83 { // cwg83: yes int &f(const char*); char &f(char *); int &k = f("foo"); -} +} // namespace cwg83 namespace cwg84 { // cwg84: yes struct B; @@ -1235,7 +1235,7 @@ namespace cwg84 { // cwg84: yes // cxx98-14-error@-1 {{no viable constructor copying variable of type 'B'}} // cxx98-14-note@#cwg84-copy-ctor {{candidate constructor not viable: expects an lvalue for 1st argument}} // cxx98-14-note@#cwg84-ctor-from-C {{candidate constructor not viable: no known conversion from 'B' to 'C' for 1st argument}} -} +} // namespace cwg84 namespace cwg85 { // cwg85: 3.4 struct A { @@ -1273,7 +1273,7 @@ namespace cwg85 { // cwg85: 3.4 // expected-error@-1 {{class member cannot be redeclared}} // expected-note@#cwg85-C-B-def {{previous declaration is here}} }; -} +} // namespace cwg85 // cwg86: dup 446 @@ -1284,7 +1284,7 @@ namespace cwg87 { // cwg87: no X x; // This is valid under cwg87 but not under cwg1975. 
X y; -} +} // namespace cwg87 namespace cwg88 { // cwg88: 2.8 template struct S { @@ -1295,7 +1295,7 @@ namespace cwg88 { // cwg88: 2.8 // expected-error@-1 {{static data member 'a' already has an initializer}} // expected-note@#cwg88-a {{previous initialization is here}} template<> const int S::b = 4; -} +} // namespace cwg88 // cwg89: na @@ -1331,12 +1331,12 @@ namespace cwg90 { // cwg90: yes cwg90_g(F()); // expected-error@-1 {{use of undeclared identifier 'cwg90_g'}} } -} +} // namespace cwg90 namespace cwg91 { // cwg91: yes union U { friend int f(U); }; int k = f(U()); -} +} // namespace cwg91 namespace cwg92 { // cwg92: 4 c++17 void f() throw(int, float); @@ -1379,14 +1379,14 @@ namespace cwg92 { // cwg92: 4 c++17 // since-cxx17-error@-1 {{ISO C++17 does not allow dynamic exception specifications}} // since-cxx17-note@-2 {{use 'noexcept(false)' instead}} Y<&h> yp; // ok -} +} // namespace cwg92 // cwg93: na namespace cwg94 { // cwg94: yes struct A { static const int n = 5; }; int arr[A::n]; -} +} // namespace cwg94 namespace cwg95 { // cwg95: 3.3 struct A; @@ -1403,7 +1403,7 @@ namespace cwg95 { // cwg95: 3.3 struct B { void f() { N::C::f(); } }; // expected-error@-1 {{'f' is a private member of 'cwg95::N::C'}} // expected-note@#cwg95-C-f {{implicitly declared private here}} -} +} // namespace cwg95 namespace cwg96 { // cwg96: sup P1787 struct A { @@ -1424,14 +1424,14 @@ namespace cwg96 { // cwg96: sup P1787 A::template S s; B b; } -} +} // namespace cwg96 namespace cwg97 { // cwg97: yes struct A { static const int a = false; static const int b = !a; }; -} +} // namespace cwg97 namespace cwg98 { // cwg98: yes void test(int n) { @@ -1459,11 +1459,11 @@ namespace cwg98 { // cwg98: yes // expected-note@#cwg98-catch {{jump bypasses initialization of catch block}} } } -} +} // namespace cwg98 namespace cwg99 { // cwg99: sup 214 template void f(T&); template int &f(const T&); const int n = 0; int &r = f(n); -} +} // namespace cwg99 diff --git 
a/clang/test/CXX/drs/cwg10xx.cpp b/clang/test/CXX/drs/cwg10xx.cpp index 01de13238a6ae..c5b96c4ab8ffc 100644 --- a/clang/test/CXX/drs/cwg10xx.cpp +++ b/clang/test/CXX/drs/cwg10xx.cpp @@ -13,7 +13,7 @@ namespace std { const T *p; size_t n; initializer_list(const T *p, size_t n); }; -} +} // namespace std namespace cwg1004 { // cwg1004: 5 template struct A {}; @@ -44,7 +44,7 @@ namespace cwg1004 { // cwg1004: 5 // expected-error@-1 {{is a constructor name}} // expected-note@#cwg1004-t {{in instantiation of default argument}} Third > t; // #cwg1004-t -} +} // namespace cwg1004 namespace cwg1042 { // cwg1042: 3.5 #if __cplusplus >= 201402L @@ -59,7 +59,7 @@ namespace cwg1042 { // cwg1042: 3.5 // list in this mode. using foo [[]] = int; #endif -} +} // namespace cwg1042 namespace cwg1048 { // cwg1048: 3.6 struct A {}; @@ -77,7 +77,7 @@ namespace cwg1048 { // cwg1048: 3.6 } } (0); #endif -} +} // namespace cwg1048 namespace cwg1054 { // cwg1054: no // FIXME: Test is incomplete. @@ -90,7 +90,7 @@ namespace cwg1054 { // cwg1054: no a; // expected-warning@-1 {{expression result unused; assign into a variable to force a volatile load}} } -} +} // namespace cwg1054 namespace cwg1070 { // cwg1070: 3.5 #if __cplusplus >= 201103L @@ -109,4 +109,4 @@ namespace cwg1070 { // cwg1070: 3.5 }; C c = {}; #endif -} +} // namespace cwg1070 diff --git a/clang/test/CXX/drs/cwg11xx.cpp b/clang/test/CXX/drs/cwg11xx.cpp index dc024caa5075b..03612b6d87645 100644 --- a/clang/test/CXX/drs/cwg11xx.cpp +++ b/clang/test/CXX/drs/cwg11xx.cpp @@ -128,6 +128,6 @@ namespace cwg1113 { // cwg1113: partial extern "C" void f(); } void g() { f(); } -} +} // namespace cwg1113 // cwg1150: na diff --git a/clang/test/CXX/drs/cwg12xx.cpp b/clang/test/CXX/drs/cwg12xx.cpp index 1011afa5905e7..344adb6d72023 100644 --- a/clang/test/CXX/drs/cwg12xx.cpp +++ b/clang/test/CXX/drs/cwg12xx.cpp @@ -30,10 +30,10 @@ namespace cwg1213 { // cwg1213: 7 using U = decltype(V4Int()[0]); using U = decltype(EV4Int()[0]); #endif -} +} 
// namespace cwg1213 -#if __cplusplus >= 201103L namespace cwg1223 { // cwg1223: 17 +#if __cplusplus >= 201103L struct M; template struct V; @@ -83,12 +83,11 @@ void g() { sizeof(auto () -> C[1]); // since-cxx11-error@-1 {{function cannot return array type 'C[1]' (aka 'cwg1223::BB[1]')}} } - -} #endif +} // namespace cwg1223 -#if __cplusplus >= 201103L namespace cwg1227 { // cwg1227: 3.0 +#if __cplusplus >= 201103L template struct A { using X = typename T::X; }; // since-cxx11-error@-1 {{type 'int' cannot be used prior to '::' because it has no members}} // since-cxx11-note@#cwg1227-g {{in instantiation of template class 'cwg1227::A' requested here}} @@ -102,8 +101,8 @@ void h() { f(0); // OK, substituting return type causes deduction to fail g(0); // #cwg1227-g-int } -} #endif +} // namespace cwg1227 namespace cwg1250 { // cwg1250: 3.9 struct Incomplete; @@ -115,7 +114,7 @@ struct Base { struct Derived : Base { virtual Incomplete *meow(); }; -} +} // namespace cwg1250 namespace cwg1265 { // cwg1265: 5 #if __cplusplus >= 201103L @@ -135,7 +134,7 @@ namespace cwg1265 { // cwg1265: 5 auto k(), l(); // since-cxx14-error@-1 {{function with deduced return type must be the only declaration in its group}} #endif -} +} // namespace cwg1265 // cwg1291: na @@ -162,5 +161,4 @@ namespace cwg1295 { // cwg1295: 4 using T = decltype(true ? 
other : x.bitfield); using T = unsigned; #endif -} - +} // namespace cwg1295 diff --git a/clang/test/CXX/drs/cwg13xx.cpp b/clang/test/CXX/drs/cwg13xx.cpp index 980fcb4eb18ac..9c72fefb5b65c 100644 --- a/clang/test/CXX/drs/cwg13xx.cpp +++ b/clang/test/CXX/drs/cwg13xx.cpp @@ -14,10 +14,10 @@ namespace std { size_t n; initializer_list(const T*, size_t); }; -} +} // namespace std -#if __cplusplus >= 201103L namespace cwg1305 { // cwg1305: 3.0 +#if __cplusplus >= 201103L struct Incomplete; // #cwg1305-Incomplete struct Complete {}; @@ -25,8 +25,8 @@ int incomplete = alignof(Incomplete(&)[]); // since-cxx11-error@-1 {{invalid application of 'alignof' to an incomplete type 'Incomplete'}} // since-cxx11-note@#cwg1305-Incomplete {{forward declaration of 'cwg1305::Incomplete'}} int complete = alignof(Complete(&)[]); -} #endif +} // namespace cwg1305 namespace cwg1307 { // cwg1307: 14 #if __cplusplus >= 201103L @@ -150,7 +150,7 @@ namespace cwg1310 { // cwg1310: 5 } template void wt_test >(); // #cwg1310-W-int template void wt_test_good >(); -} +} // namespace cwg1310 namespace cwg1315 { // cwg1315: partial template struct A {}; @@ -182,7 +182,7 @@ namespace cwg1315 { // cwg1315: partial template struct C; template struct C; // expected-error@-1 {{type of specialized non-type template argument depends on a template parameter of the partial specialization}} -} +} // namespace cwg1315 namespace cwg1330 { // cwg1330: 4 c++11 // exception-specifications are parsed in a context where the class is complete. 
@@ -302,7 +302,7 @@ namespace cwg1330 { // cwg1330: 4 c++11 struct E : C {}; // #cwg1330-C-int E e; // #cwg1330-e -} +} // namespace cwg1330 // cwg1334: sup 1719 @@ -333,7 +333,7 @@ struct S { int z : 1 || new int { 0 }; }; #endif -} +} // namespace cwg1341 namespace cwg1346 { // cwg1346: 3.5 auto a(1); @@ -376,7 +376,7 @@ namespace cwg1346 { // cwg1346: 3.5 // since-cxx11-error@-2 {{cannot deduce type for lambda capture 'e' from parenthesized initializer list}} } #endif -} +} // namespace cwg1346 namespace cwg1347 { // cwg1347: 3.1 auto x = 5, *y = &x; @@ -390,7 +390,7 @@ namespace cwg1347 { // cwg1347: 3.1 auto (*fp)(int) -> int, i = 0; // since-cxx11-error@-1 {{declaration with trailing return type must be the only declaration in its group}} #endif -} +} // namespace cwg1347 namespace cwg1350 { // cwg1350: 3.5 #if __cplusplus >= 201103L @@ -520,7 +520,7 @@ namespace cwg1358 { // cwg1358: 3.1 // cxx11-20-note@#cwg1358-NonLit {{'NonLit' is not literal because it is not an aggregate and has no constexpr constructors other than copy or move constructors}} }; #endif -} +} // namespace cwg1358 namespace cwg1359 { // cwg1359: 3.5 #if __cplusplus >= 201103L @@ -549,7 +549,7 @@ namespace cwg1359 { // cwg1359: 3.5 // cxx11-17-note@#cwg1359-Y {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 0 were provided}} // cxx11-17-note@#cwg1359-Y {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 0 were provided}} #endif -} +} // namespace cwg1359 namespace cwg1388 { // cwg1388: 4 template void f(T..., A); // #cwg1388-f @@ -622,7 +622,7 @@ namespace cwg1388 { // cwg1388: 4 // expected-error@-1 {{no matching function for call to 'f_pair_4'}} // expected-note@#cwg1388-f-4 {{candidate template ignored: deduced packs of different lengths for parameter 'T' ( vs. 
)}} } -} +} // namespace cwg1388 namespace cwg1391 { // cwg1391: partial struct A {}; struct B : A {}; @@ -713,14 +713,14 @@ namespace cwg1391 { // cwg1391: partial int test_c1 = c(0); // ok int test_c2 = c(0); // FIXME: apparently ambiguous } -} +} // namespace cwg1391 namespace cwg1394 { // cwg1394: 15 #if __cplusplus >= 201103L struct Incomplete; Incomplete f(Incomplete) = delete; // well-formed #endif -} +} // namespace cwg1394 namespace cwg1395 { // cwg1395: 16 #if __cplusplus >= 201103L @@ -731,7 +731,7 @@ namespace cwg1395 { // cwg1395: 16 f(&i); } #endif -} +} // namespace cwg1395 namespace cwg1397 { // cwg1397: 3.2 #if __cplusplus >= 201103L @@ -757,4 +757,4 @@ namespace cwg1399 { // cwg1399: dup 1388 // expected-error@-1 {{no matching function for call to 'f'}} // expected-note@#cwg1399-f {{candidate template ignored: deduced packs of different lengths for parameter 'T' (<> vs. )}} } -} +} // namespace cwg1399 diff --git a/clang/test/CXX/drs/cwg14xx.cpp b/clang/test/CXX/drs/cwg14xx.cpp index cb2f34bf5e427..149468eb292c9 100644 --- a/clang/test/CXX/drs/cwg14xx.cpp +++ b/clang/test/CXX/drs/cwg14xx.cpp @@ -40,7 +40,7 @@ namespace cwg1413 { // cwg1413: 12 // expected-note@#cwg1413-var2 {{'var2' declared here}} } }; -} +} // namespace cwg1413 namespace cwg1423 { // cwg1423: 11 #if __cplusplus >= 201103L @@ -53,7 +53,7 @@ namespace cwg1423 { // cwg1423: 11 bool b4{nullptr}; // since-cxx11-warning@-1 {{implicit conversion of nullptr constant to 'bool'}} #endif -} +} // namespace cwg1423 // cwg1425: na abi @@ -76,7 +76,7 @@ namespace cwg1432 { // cwg1432: 16 template struct common_type; #endif -} +} // namespace cwg1432 namespace cwg1443 { // cwg1443: yes struct A { @@ -84,7 +84,7 @@ struct A { A() { void foo(int=i); } // expected-error@-1 {{default argument references 'this'}} }; -} +} // namespace cwg1443 namespace cwg1458 { // cwg1458: 3.1 #if __cplusplus >= 201103L @@ -357,7 +357,7 @@ namespace cwg1460 { // cwg1460: 3.5 static_assert(d.a == 0, ""); } #endif -} 
+} // namespace cwg1460 #if __cplusplus >= 201103L namespace std { @@ -388,7 +388,7 @@ namespace std { const _E* begin() const {return __begin_;} const _E* end() const {return __begin_ + __size_;} }; -} // std +} // namespace std #endif namespace cwg1467 { // cwg1467: 3.7 c++11 @@ -601,7 +601,7 @@ namespace cwg1467 { // cwg1467: 3.7 c++11 } } // namespace StringLiterals #endif -} // cwg1467 +} // namespace cwg1467 namespace cwg1477 { // cwg1477: 2.7 namespace N { @@ -630,7 +630,7 @@ namespace cwg1479 { // cwg1479: 3.1 int operator""_a(const char*, std::size_t = 0); // since-cxx11-error@-1 {{literal operator cannot have a default argument}} #endif -} +} // namespace cwg1479 namespace cwg1482 { // cwg1482: 3.0 // NB: sup 2516, test reused there @@ -675,7 +675,7 @@ namespace cwg1490 { // cwg1490: 3.7 c++11 std::initializer_list{"abc"}; // since-cxx11-error@-1 {{expected unqualified-id}}} #endif -} // cwg1490 +} // namespace cwg1490 namespace cwg1495 { // cwg1495: 4 #if __cplusplus >= 201103L @@ -717,7 +717,7 @@ namespace cwg1495 { // cwg1495: 4 // since-cxx14-note@#cwg1495-c {{template is declared here}} #endif #endif -} +} // namespace cwg1495 namespace cwg1496 { // cwg1496: no #if __cplusplus >= 201103L @@ -728,4 +728,4 @@ struct A { // default constructor which is not deleted. static_assert(__is_trivial(A), ""); #endif -} +} // namespace cwg1496 diff --git a/clang/test/CXX/drs/cwg15xx.cpp b/clang/test/CXX/drs/cwg15xx.cpp index 961c25000111a..d10890ee3fd15 100644 --- a/clang/test/CXX/drs/cwg15xx.cpp +++ b/clang/test/CXX/drs/cwg15xx.cpp @@ -170,7 +170,7 @@ namespace cwg1512 { // cwg1512: 4 // since-cxx11-note@#cwg1512-Wrap {{second operand was implicitly converted to type 'int *'}} } #endif -} +} // namespace cwg1512 namespace cwg1514 { // cwg1514: 11 #if __cplusplus >= 201103L @@ -184,7 +184,7 @@ namespace cwg1514 { // cwg1514: 11 // The behavior in other contexts is superseded by CWG1966. 
#endif -} +} // namespace cwg1514 namespace cwg1518 { // cwg1518: 4 #if __cplusplus >= 201103L @@ -321,13 +321,13 @@ namespace std_example { } } #endif // __cplusplus >= 201103L -} +} // namespace cwg1518 namespace cwg1550 { // cwg1550: 3.4 int f(bool b, int n) { return (b ? (throw 0) : n) + (b ? n : (throw 0)); } -} +} // namespace cwg1550 namespace cwg1558 { // cwg1558: 12 #if __cplusplus >= 201103L @@ -344,7 +344,7 @@ namespace cwg1558 { // cwg1558: 12 // since-cxx11-note@#cwg1558-f {{candidate template ignored: substitution failure [with T = int]: type 'int' cannot be used prior to '::' because it has no members}} } #endif -} +} // namespace cwg1558 namespace cwg1560 { // cwg1560: 3.5 void f(bool b, int n) { @@ -353,7 +353,7 @@ namespace cwg1560 { // cwg1560: 3.5 class X { X(const X&); }; const X &get(); const X &x = true ? get() : throw 0; -} +} // namespace cwg1560 namespace cwg1563 { // cwg1563: yes #if __cplusplus >= 201103L @@ -363,7 +363,7 @@ namespace cwg1563 { // cwg1563: yes using fun = double(double); fun &foo{bar}; // ok #endif -} +} // namespace cwg1563 namespace cwg1567 { // cwg1567: 3.3 #if __cplusplus >= 201103L @@ -402,7 +402,7 @@ B b5{A{0}}; // since-cxx11-note@#cwg1567-B {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'A' to 'B' for 1st argument}} // since-cxx11-note@#cwg1567-B-double {{candidate constructor not viable: no known conversion from 'A' to 'double' for 1st argument}} #endif -} +} // namespace cwg1567 namespace cwg1573 { // cwg1573: 3.9 #if __cplusplus >= 201103L @@ -445,7 +445,7 @@ namespace cwg1573 { // cwg1573: 3.9 // since-cxx11-error@-1 {{call to deleted constructor of 'J'}} // since-cxx11-note@#cwg1573-I {{'I' has been explicitly marked deleted here}} #endif -} +} // namespace cwg1573 #if __cplusplus >= 201103L namespace std { @@ -485,7 +485,7 @@ namespace std { }; typedef basic_string string; -} // std +} // namespace std #endif namespace cwg1579 { // cwg1579: 3.9 @@ -558,7 +558,7 
@@ auto CWG1579_lambda_invalid = []() -> GenericMoveOnly { // since-cxx11-note@#cwg1579-deleted-U {{'GenericMoveOnly' has been explicitly marked deleted here}} }; #endif -} // end namespace cwg1579 +} // namespace cwg1579 namespace cwg1584 { // cwg1584: 7 drafting 2015-05 // Deducing function types from cv-qualified types @@ -633,7 +633,7 @@ namespace cwg1589 { // cwg1589: 3.7 c++11 // since-cxx11-note@#cwg1589-f2-ilist-int {{candidate function}} } #endif -} // cwg1589 +} // namespace cwg1589 namespace cwg1591 { //cwg1591. Deducing array bound and element type from initializer list #if __cplusplus >= 201103L @@ -718,4 +718,4 @@ namespace cwg1591 { //cwg1591. Deducing array bound and element type from initi short *ps = i(Arr{1, 2}); // OK #5 } #endif -} // cwg1591 +} // namespace cwg1591 diff --git a/clang/test/CXX/drs/cwg16xx.cpp b/clang/test/CXX/drs/cwg16xx.cpp index 95e241f0d03e9..bd2c484344ddf 100644 --- a/clang/test/CXX/drs/cwg16xx.cpp +++ b/clang/test/CXX/drs/cwg16xx.cpp @@ -51,7 +51,7 @@ namespace cwg1611 { // cwg1611: dup 1658 struct B : virtual A { virtual void f() = 0; }; struct C : B { C() : A(0) {} void f(); }; C c; -} +} // namespace cwg1611 namespace cwg1631 { // cwg1631: 3.7 #if __cplusplus >= 201103L @@ -81,7 +81,7 @@ namespace cwg1631 { // cwg1631: 3.7 } } #endif -} +} // namespace cwg1631 namespace cwg1638 { // cwg1638: 3.1 #if __cplusplus >= 201103L @@ -122,7 +122,7 @@ namespace cwg1638 { // cwg1638: 3.1 // since-cxx11-note@-3 {{remove 'enum class' to befriend an enum}} }; #endif -} +} // namespace cwg1638 namespace cwg1645 { // cwg1645: 3.9 #if __cplusplus >= 201103L @@ -149,14 +149,14 @@ namespace cwg1645 { // cwg1645: 3.9 // since-cxx11-note@#cwg1645-int-int-int {{candidate inherited constructor has been explicitly deleted}} // since-cxx11-note@#cwg1645-using {{constructor from base class 'A' inherited here}} #endif -} +} // namespace cwg1645 namespace cwg1652 { // cwg1652: 3.6 int a, b; static_assert(&a + 1 == &b, ""); // expected-error@-1 
{{static assertion expression is not an integral constant expression}} // expected-note@-2 {{comparison against pointer '&a + 1' that points past the end of a complete object has unspecified value}} -} +} // namespace cwg1652 namespace cwg1653 { // cwg1653: 4 c++17 void f(bool b) { @@ -173,7 +173,7 @@ namespace cwg1653 { // cwg1653: 4 c++17 b += 1; // ok b -= 1; // ok } -} +} // namespace cwg1653 namespace cwg1658 { // cwg1658: 5 namespace DefCtor { @@ -324,7 +324,7 @@ namespace cwg1658 { // cwg1658: 5 } // assignment case is superseded by cwg2180 -} +} // namespace cwg1658 namespace cwg1672 { // cwg1672: 7 struct Empty {}; @@ -349,7 +349,7 @@ namespace cwg1672 { // cwg1672: 7 static_assert(!__is_standard_layout(Y), ""); static_assert(!__is_standard_layout(Y), ""); static_assert(!__is_standard_layout(Y), ""); -} +} // namespace cwg1672 namespace cwg1684 { // cwg1684: 3.6 #if __cplusplus >= 201103L @@ -363,7 +363,7 @@ namespace cwg1684 { // cwg1684: 3.6 // cxx11-20-error@-1 {{constexpr function's 1st parameter type 'NonLiteral' is not a literal type}} // cxx11-20-note@#cwg1684-struct {{'NonLiteral' is not literal because it is not an aggregate and has no constexpr constructors other than copy or move constructors}} #endif -} +} // namespace cwg1684 namespace cwg1687 { // cwg1687: 7 template struct To { @@ -386,7 +386,7 @@ namespace cwg1687 { // cwg1687: 7 // since-cxx20-error@-1 {{invalid operands to binary expression ('To' and 'To')}} // since-cxx20-note@#cwg1687-op-T {{operand was implicitly converted to type 'cwg1687::E}} #endif -} +} // namespace cwg1687 namespace cwg1690 { // cwg1690: 9 // See also the various tests in "CXX/basic/basic.lookup/basic.lookup.argdep". 
@@ -401,7 +401,7 @@ namespace cwg1690 { // cwg1690: 9 f(s); // ok } #endif -} +} // namespace cwg1690 namespace cwg1691 { // cwg1691: 9 #if __cplusplus >= 201103L @@ -421,7 +421,7 @@ namespace cwg1691 { // cwg1691: 9 // since-cxx11-note@#cwg1691-g {{'N::g' declared here}} } #endif -} +} // namespace cwg1691 namespace cwg1692 { // cwg1692: 9 namespace N { @@ -436,7 +436,7 @@ namespace cwg1692 { // cwg1692: 9 N::A::B::C c; f(c); // ok } -} +} // namespace cwg1692 namespace cwg1696 { // cwg1696: 7 namespace std_examples { @@ -554,4 +554,4 @@ namespace cwg1696 { // cwg1696: 7 // since-cxx11-note@#cwg1696-il-5 {{nitializing field 'il' with default member initializer}} }; #endif -} +} // namespace cwg1696 diff --git a/clang/test/CXX/drs/cwg177x.cpp b/clang/test/CXX/drs/cwg177x.cpp index 72a12c2f92c20..a17fd221b51f0 100644 --- a/clang/test/CXX/drs/cwg177x.cpp +++ b/clang/test/CXX/drs/cwg177x.cpp @@ -31,7 +31,7 @@ namespace cwg1772 { // cwg1772: 14 // CXX11-NEXT: StringLiteral{{.+}} 'const char[11]' lvalue "operator()" } #endif // __cplusplus >= 201103L -} +} // namespace cwg1772 namespace cwg1779 { // cwg1779: 14 // __func__ in a function template, member function template, or generic @@ -79,4 +79,4 @@ namespace cwg1779 { // cwg1779: 14 }; } #endif // __cplusplus >= 201402L -} +} // namespace cwg1779 diff --git a/clang/test/CXX/drs/cwg17xx.cpp b/clang/test/CXX/drs/cwg17xx.cpp index fb53a56923b10..04bf637543a29 100644 --- a/clang/test/CXX/drs/cwg17xx.cpp +++ b/clang/test/CXX/drs/cwg17xx.cpp @@ -44,7 +44,7 @@ namespace cwg1715 { // cwg1715: 3.9 // since-cxx11-note@#cwg1715-E {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 2 were provided}} // since-cxx11-note@#cwg1715-E {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 2 were provided}} #endif -} +} // namespace cwg1715 namespace cwg1719 { // cwg1719: 19 #if __cplusplus >= 201103L @@ -96,7 +96,7 @@ struct A { // operator, or move 
assignment operator. static_assert(__is_trivially_copyable(A), ""); #endif -} +} // namespace cwg1734 namespace cwg1736 { // cwg1736: 3.9 #if __cplusplus >= 201103L @@ -115,7 +115,7 @@ struct S { struct Q { typedef int type; } q; S s(q); // #cwg1736-s #endif -} +} // namespace cwg1736 namespace cwg1738 { // cwg1738: sup P0136R1 #if __cplusplus >= 201103L @@ -132,7 +132,7 @@ struct B : A { template B::B(int, double); // since-cxx11-error@-1 {{explicit instantiation of 'B' does not refer to a function template, variable template, member function, member class, or static data member}} #endif -} +} // namespace cwg1738 // cwg1748 is in cwg1748.cpp @@ -165,7 +165,7 @@ namespace cwg1753 { // cwg1753: 11 n.~decltype(n)(); // OK #endif } -} +} // namespace cwg1753 namespace cwg1756 { // cwg1756: 3.7 #if __cplusplus >= 201103L @@ -176,7 +176,7 @@ namespace cwg1756 { // cwg1756: 3.7 struct X { operator int(); } x; int b{x}; #endif -} +} // namespace cwg1756 namespace cwg1758 { // cwg1758: 3.7 #if __cplusplus >= 201103L @@ -195,7 +195,7 @@ namespace cwg1758 { // cwg1758: 3.7 } b; A a{b}; #endif -} +} // namespace cwg1758 namespace cwg1762 { // cwg1762: 14 #if __cplusplus >= 201103L @@ -204,7 +204,7 @@ namespace cwg1762 { // cwg1762: 14 // since-cxx11-error@-1 {{invalid suffix on literal; C++11 requires a space between literal and identifier}} // since-cxx11-warning@-2 {{user-defined literal suffixes not starting with '_' are reserved; no literal will invoke this operator}} #endif -} +} // namespace cwg1762 // cwg1772 is in cwg177x.cpp @@ -221,7 +221,7 @@ namespace cwg1778 { // cwg1778: 9 static_assert(!noexcept(C()), ""); static_assert(noexcept(D()), ""); #endif -} +} // namespace cwg1778 // cwg1779 is in cwg177x.cpp diff --git a/clang/test/CXX/drs/cwg1807.cpp b/clang/test/CXX/drs/cwg1807.cpp index 59edacc49658c..a2c4968ef7735 100644 --- a/clang/test/CXX/drs/cwg1807.cpp +++ b/clang/test/CXX/drs/cwg1807.cpp @@ -15,7 +15,7 @@ struct S { void f() { S s[3]; } -} +} // namespace 
cwg1807 // CHECK-LABEL: define dso_local void @cwg1807::f() // CHECK: invoke void @cwg1807::S::S(){{.+}} diff --git a/clang/test/CXX/drs/cwg18xx.cpp b/clang/test/CXX/drs/cwg18xx.cpp index 0fd2cd6b2d870..4beeb41ac3728 100644 --- a/clang/test/CXX/drs/cwg18xx.cpp +++ b/clang/test/CXX/drs/cwg18xx.cpp @@ -26,7 +26,7 @@ S V; // #cwg1801-S-i // cxx98-14-error@-1 {{non-type template argument does not refer to any declaration}} // cxx98-14-note@#cwg1801-S {{template parameter is declared here}} // cxx17-error@#cwg1801-S-i {{non-type template argument refers to subobject '.i'}} -} +} // namespace cwg1801 namespace cwg1802 { // cwg1802: 3.1 #if __cplusplus >= 201103L @@ -204,7 +204,7 @@ namespace cwg1814 { // cwg1814: yes auto lam = [](int x = 42) { return x; }; } #endif -} +} // namespace cwg1814 namespace cwg1815 { // cwg1815: 20 #if __cplusplus >= 201402L @@ -229,7 +229,7 @@ namespace cwg1815 { // cwg1815: 20 static_assert(f() == 0); #endif #endif -} +} // namespace cwg1815 // cwg1818 is in cwg1818.cpp @@ -303,7 +303,7 @@ namespace cwg1822 { // cwg1822: yes static_assert(__is_same(decltype(a), int), "should be resolved to lambda parameter"); }; #endif -} +} // namespace cwg1822 namespace cwg1824 { // cwg1824: 2.7 template @@ -373,7 +373,7 @@ namespace cwg1837 { // cwg1837: 3.3 }; }; #endif -} +} // namespace cwg1837 namespace cwg1862 { // cwg1862: no template @@ -498,7 +498,7 @@ namespace cwg1872 { // cwg1872: 9 // since-cxx23-note@-5 {{cannot construct object of type 'A' with virtual base class in a constant expression}} #endif #endif -} +} // namespace cwg1872 namespace cwg1878 { // cwg1878: 18 #if __cplusplus >= 201402L @@ -533,7 +533,7 @@ struct S { #endif }; #endif -} +} // namespace cwg1878 namespace cwg1881 { // cwg1881: 7 struct A { int a : 4; }; @@ -545,7 +545,7 @@ namespace cwg1881 { // cwg1881: 7 struct D : C { int : 0; }; static_assert(__is_standard_layout(C), ""); static_assert(!__is_standard_layout(D), ""); -} +} // namespace cwg1881 // cwg1884 is in 
cwg1884.cpp @@ -613,7 +613,7 @@ void cwg1891() { // cwg1891: 4 // since-cxx11-error-re@-1 {{{{object of type '\(lambda at .+\)' cannot be assigned because its copy assignment operator is implicitly deleted}}}} // since-cxx11-note@#cwg1891-b {{lambda expression begins here}} #endif -} +} // void cwg1891() namespace cwg1894 { // cwg1894: 3.8 // NB: reusing part of cwg407 test @@ -641,7 +641,7 @@ namespace H { using namespace A; struct S s; } -} +} // namespace cwg1894 namespace cwg1898 { // cwg1898: 2.7 void e(int) {} // #cwg1898-e diff --git a/clang/test/CXX/drs/cwg19xx.cpp b/clang/test/CXX/drs/cwg19xx.cpp index 2fe46909eaacb..a01082a440278 100644 --- a/clang/test/CXX/drs/cwg19xx.cpp +++ b/clang/test/CXX/drs/cwg19xx.cpp @@ -44,7 +44,7 @@ namespace cwg1902 { // cwg1902: 3.7 // since-cxx11-note@#cwg1902-B-A {{candidate constructor}} // since-cxx11-note@#cwg1902-B-copy-ctor {{candidate constructor has been explicitly deleted}} #endif -} +} // namespace cwg1902 namespace cwg1903 { // cwg1903: 2.7 namespace A { @@ -69,7 +69,7 @@ namespace cwg1903 { // cwg1903: 2.7 using A::d; struct a *p; } -} +} // namespace cwg1903 namespace cwg1909 { // cwg1909: 3.7 struct A { @@ -90,7 +90,7 @@ namespace cwg1909 { // cwg1909: 3.7 // cxx98-error@-1 {{alias declarations are a C++11 extension}} // expected-error@-2 {{member 'D' has the same name as its class}} }; -} +} // namespace cwg1909 namespace cwg1918 { // cwg1918: no template struct A { @@ -125,7 +125,7 @@ static union { int not_empty; }; #endif -} +} // namespace cwg1918 namespace cwg1941 { // cwg1941: 3.9 #if __cplusplus >= 201402L @@ -155,7 +155,7 @@ struct iter { derived d1(it, end); derived d2(42, 9); #endif -} +} // namespace cwg1941 namespace cwg1945 { // cwg1945: no template struct A { @@ -180,7 +180,7 @@ unsigned b = 0b'01; unsigned x = 0x'01; // since-cxx14-error@-1 {{invalid suffix 'x'01' on integer constant}} #endif -} +} // namespace cwg1947 #if __cplusplus >= 201103L // cwg1948: 3.5 @@ -230,7 +230,7 @@ namespace 
cwg1959 { // cwg1959: 3.9 // where the base class is reference-related to the argument type. c q(static_cast(q)); #endif -} +} // namespace cwg1959 namespace cwg1960 { // cwg1960: no struct A { @@ -251,7 +251,7 @@ struct C : B { using A::f; using A::g; }; -} +} // namespace cwg1960 namespace cwg1966 { // cwg1966: 11 #if __cplusplus >= 201103L @@ -280,7 +280,7 @@ namespace cwg1966 { // cwg1966: 11 // since-cxx11-error@-3 {{anonymous bit-field cannot have a default member initializer}} }; #endif -} +} // namespace cwg1966 namespace cwg1968 { // cwg1968: no #if __cplusplus >= 201103L @@ -291,7 +291,7 @@ namespace cwg1968 { // cwg1968: no constexpr const std::type_info *f() { return &typeid(int); } static_assert(f() == f(), ""); #endif -} +} // namespace cwg1968 namespace cwg1991 { // cwg1991: 3.9 #if __cplusplus >= 201103L @@ -309,6 +309,6 @@ namespace cwg1991 { // cwg1991: 3.9 // of ambiguity. B b(0, 0); // ok, calls B constructor #endif -} +} // namespace cwg1991 // cwg1994: dup 529 diff --git a/clang/test/CXX/drs/cwg1xx.cpp b/clang/test/CXX/drs/cwg1xx.cpp index cc1dd784b127d..39dfd310e93a3 100644 --- a/clang/test/CXX/drs/cwg1xx.cpp +++ b/clang/test/CXX/drs/cwg1xx.cpp @@ -38,7 +38,7 @@ namespace cwg100 { // cwg100: yes // cxx98-14-error@#cwg100-d {{non-type template argument does not refer to any declaration}} // cxx98-14-note@#cwg100-D {{template parameter is declared here}} // since-cxx17-error@#cwg100-d {{reference to subobject of string literal is not allowed in a template argument}} -} +} // namespace cwg100 namespace cwg101 { // cwg101: 3.5 extern "C" void cwg101_f(); @@ -51,7 +51,7 @@ namespace cwg101 { // cwg101: 3.5 using X::size_t; extern "C" void cwg101_f(); typedef unsigned size_t; -} +} // namespace cwg101 namespace cwg102 { // cwg102: yes namespace A { @@ -65,7 +65,7 @@ namespace cwg102 { // cwg102: yes } B::S operator+(B::S, B::S); // #cwg102-operator-plus template B::S A::f(B::S, B::S); // #cwg102-instantiation -} +} // namespace cwg102 // cwg103: na 
// cwg104: na lib @@ -85,12 +85,12 @@ namespace cwg106 { // cwg106: sup 540 // expected-warning@-1 {{'const' qualifier on reference type 'r2' (aka 'const int &') has no effect}} typedef const r2 &r2; // expected-warning@-1 {{'const' qualifier on reference type 'r2' (aka 'const int &') has no effect}} -} +} // namespace cwg106 namespace cwg107 { // cwg107: yes struct S {}; extern "C" S operator+(S, S) { return S(); } -} +} // namespace cwg107 namespace cwg108 { // cwg108: 2.9 template struct A { @@ -101,7 +101,7 @@ namespace cwg108 { // cwg108: 2.9 // expected-error@-1 {{unknown type name 'X'}} }; template<> struct A::B { int X; }; -} +} // namespace cwg108 namespace cwg109 { // cwg109: yes struct A { template void f(T); }; @@ -118,7 +118,7 @@ namespace cwg109 { // cwg109: yes void g() { this->f(123); } // expected-error@-1 {{use 'template' keyword to treat 'f' as a dependent template name}} }; -} +} // namespace cwg109 namespace cwg110 { // cwg110: 2.8 template @@ -142,7 +142,7 @@ namespace cwg111 { // cwg111: dup 535 // expected-error@-1 {{no matching constructor for initialization of 'B'}} // expected-note@#cwg111-B {{candidate constructor (the implicit copy constructor) not viable: 1st argument ('const B') would lose const qualifier}} // expected-note@#cwg111-B {{candidate constructor not viable: requires 0 arguments, but 1 was provided}} -} +} // namespace cwg111 namespace cwg112 { // cwg112: yes struct T { int n; }; @@ -162,7 +162,7 @@ namespace cwg112 { // cwg112: yes // cxx98-error@-1 {{non-type template argument referring to object 'a3' with internal linkage is a C++11 extension}} // cxx98-note@#cwg112-a3 {{non-type template argument refers to object here}} X x4; -} +} // namespace cwg112 namespace cwg113 { // cwg113: yes extern void (*p)(); @@ -173,7 +173,7 @@ namespace cwg113 { // cwg113: yes } void g(); void (*p)() = &g; -} +} // namespace cwg113 namespace cwg114 { // cwg114: yes struct A { @@ -185,7 +185,7 @@ namespace cwg114 { // cwg114: yes } b; // 
expected-error@-1 {{variable type 'struct B' is an abstract class}} // expected-note@#cwg114-A-f {{unimplemented pure virtual method 'f' in 'B'}} -} +} // namespace cwg114 namespace cwg115 { // cwg115: 3.0 template int f(T); // #cwg115-f @@ -294,7 +294,7 @@ namespace cwg115 { // cwg115: 3.0 // expected-warning@-1 {{expression result unused}} } #endif -} +} // namespace cwg115 namespace cwg116 { // cwg116: yes template struct A {}; @@ -306,7 +306,7 @@ namespace cwg116 { // cwg116: yes template void f(A) {} // expected-error@-1 {{redefinition of 'f'}} // expected-note@#cwg116-f-T {{previous definition is here}} -} +} // namespace cwg116 // cwg117: na // cwg118 is in cwg118.cpp @@ -324,12 +324,12 @@ namespace cwg121 { // cwg121: yes // cxx98-17-error@-2 {{missing 'typename' prior to dependent type name T::Y; implicit 'typename' is a C++20 extension}} }; Z z; -} +} // namespace cwg121 namespace cwg122 { // cwg122: yes template void f(); void g() { f(); } -} +} // namespace cwg122 // cwg123: na // cwg124 is in cwg124.cpp @@ -349,7 +349,7 @@ namespace cwg125 { // since-cxx11-error@#cwg125_C {{'cwg125_B' is missing exception specification 'noexcept'}} // since-cxx11-note@#cwg125_B {{previous declaration is here}} }; -} +} // namespace cwg125 namespace cwg126 { // cwg126: partial // FIXME: We do not yet generate correct code for this change: @@ -460,7 +460,7 @@ namespace cwg126 { // cwg126: partial void f() throw(int); // since-cxx17-error@-1 {{ISO C++17 does not allow dynamic exception specifications}} // since-cxx17-note@-2 {{use 'noexcept(false)' instead}} -} +} // namespace cwg126 namespace cwg127 { // cwg127: 2.9 __extension__ typedef __decltype(sizeof(0)) size_t; @@ -476,12 +476,12 @@ namespace cwg127 { // cwg127: 2.9 }; A *p = new A; // #cwg127-p A *q = new ("") A; // #cwg127-q -} +} // namespace cwg127 namespace cwg128 { // cwg128: yes enum E1 { e1 } x = e1; enum E2 { e2 } y = static_cast(x), z = static_cast(e1); -} +} // namespace cwg128 // cwg129: dup 616 // 
cwg130: na @@ -490,7 +490,7 @@ namespace cwg131 { // cwg131: sup P1949 const char *a_with_\u0e8c = "\u0e8c"; const char *b_with_\u0e8d = "\u0e8d"; const char *c_with_\u0e8e = "\u0e8e"; -} +} // namespace cwg131 namespace cwg132 { // cwg132: no void f() { @@ -498,7 +498,7 @@ namespace cwg132 { // cwg132: no extern struct S {} y; // FIXME: This is invalid. } static enum { E } e; -} +} // namespace cwg132 // cwg133: dup 87 // cwg134: na @@ -509,7 +509,7 @@ namespace cwg135 { // cwg135: yes friend A g(A a) { return a; } static A h(A a) { return a; } }; -} +} // namespace cwg135 namespace cwg136 { // cwg136: 3.4 void f(int, int, int = 0); // #cwg136-f @@ -557,7 +557,7 @@ namespace cwg136 { // cwg136: 3.4 // expected-error@-1 {{friend declaration specifying a default argument must be the only declaration}} // expected-note@#cwg136-B-f {{previous declaration is here}} }; -} +} // namespace cwg136 namespace cwg137 { // cwg137: yes extern void *p; @@ -581,7 +581,7 @@ namespace cwg137 { // cwg137: yes const volatile int *cvqc = static_cast(cp); const volatile int *cvqv = static_cast(vp); const volatile int *cvqcv = static_cast(cvp); -} +} // namespace cwg137 namespace cwg138 { // cwg138: partial namespace example1 { @@ -677,7 +677,7 @@ namespace cwg139 { // cwg139: yes }; } } -} +} // namespace cwg139 namespace cwg140 { // cwg140: yes void f(int *const) {} // #cwg140-f-first @@ -686,7 +686,7 @@ namespace cwg140 { // cwg140: yes // expected-note@#cwg140-f-first {{previous definition is here}} void g(const int); void g(int n) { n = 2; } -} +} // namespace cwg140 namespace cwg141 { // cwg141: 3.1 template void f(); @@ -728,7 +728,7 @@ namespace cwg141 { // cwg141: 3.1 template void S(); }; void i() { C().i(); } // ok!! 
-} +} // namespace cwg141 namespace cwg142 { // cwg142: 2.8 class B { // #cwg142-B @@ -781,7 +781,7 @@ namespace cwg142 { // cwg142: 2.8 cwg142::B *bp2 = (cwg142::B*)this; bp2->mi = 3; } -} +} // namespace cwg142 namespace cwg143 { // cwg143: yes namespace A { struct X; } @@ -793,7 +793,7 @@ namespace cwg143 { // cwg143: yes f(x); // expected-error@-1 {{use of undeclared identifier 'f'}} } -} +} // namespace cwg143 namespace cwg145 { // cwg145: yes void f(bool b) { @@ -804,7 +804,7 @@ namespace cwg145 { // cwg145: yes // cxx98-14-warning@-1 {{incrementing expression of type bool is deprecated and incompatible with C++17}} // since-cxx17-error@-2 {{ISO C++17 does not allow incrementing expression of type bool}} } -} +} // namespace cwg145 namespace cwg147 { // cwg147: yes namespace example1 { @@ -832,13 +832,13 @@ namespace cwg147 { // cwg147: yes template<> A::A(A::a); // expected-error@-1 {{qualified reference to 'A' is a constructor name rather than a template name in this context}} } -} +} // namespace cwg147 namespace cwg148 { // cwg148: yes struct A { int A::*p; }; static_assert(__is_pod(int(A::*)), ""); static_assert(__is_pod(A), ""); -} +} // namespace cwg148 // cwg149: na @@ -880,13 +880,13 @@ namespace cwg150 { // cwg150: 19 b + p; } } // namespace n1 -} +} // namespace cwg150 namespace cwg151 { // cwg151: 3.1 struct X {}; typedef int X::*p; static_assert(__enable_constant_folding(p() == 0), ""); -} +} // namespace cwg151 namespace cwg152 { // cwg152: yes struct A { @@ -905,7 +905,7 @@ namespace cwg152 { // cwg152: yes // expected-note@#cwg152-A-explicit-ctor {{explicit constructor is not a candidate}} // expected-note@#cwg152-A-ctor {{candidate constructor not viable: requires 0 arguments, but 1 was provided}} A a4(f()); -} +} // namespace cwg152 // cwg153: na @@ -916,12 +916,12 @@ namespace cwg154 { // cwg154: yes union { int b; }; } static union { int c; }; -} +} // namespace cwg154 namespace cwg155 { // cwg155: dup 632 struct S { int n; } s = { { 1 } 
}; // expected-warning@-1 {{braces around scalar initializer}} -} +} // namespace cwg155 // cwg156: sup 1111 // cwg158 is in cwg158.cpp @@ -932,7 +932,7 @@ namespace cwg159 { // cwg159: 3.5 void cwg159::f() {} // expected-warning@-1 {{extra qualification on member 'f'}} void cwg159::X::f() {} -} +} // namespace cwg159 // cwg160: na @@ -971,7 +971,7 @@ namespace cwg161 { // cwg161: 3.1 D::sf(); } }; -} +} // namespace cwg161 namespace cwg162 { // cwg162: 19 struct A { @@ -988,7 +988,7 @@ namespace cwg162 { // cwg162: 19 int &c = (&A::f)(0); char &d = (&A::f)('0'); // expected-error@-1 {{non-const lvalue reference to type 'char' cannot bind to a value of unrelated type 'int'}} -} +} // namespace cwg162 // cwg163: na @@ -1000,7 +1000,7 @@ namespace cwg164 { // cwg164: yes int f(E); int k = g(e); -} +} // namespace cwg164 namespace cwg165 { // cwg165: no namespace N { @@ -1011,7 +1011,7 @@ namespace cwg165 { // cwg165: no struct N::B {}; // FIXME: cwg165 says this is ill-formed, but the argument in cwg1477 says it's ok void N::g() {} -} +} // namespace cwg165 namespace cwg166 { // cwg166: 2.9 namespace A { class X; } @@ -1043,7 +1043,7 @@ namespace cwg166 { // cwg166: 2.9 int i(A::X x) { return x.n; } // expected-error@-1 {{'n' is a private member of 'cwg166::A::X'}} // expected-note@#cwg166-X-n {{implicitly declared private here}} -} +} // namespace cwg166 // cwg167: sup 1012 @@ -1055,7 +1055,7 @@ namespace cwg168 { // cwg168: no }; p a = &S::f; // FIXME: this should fail. 
q b = &S::f; -} +} // namespace cwg168 namespace cwg169 { // cwg169: yes template struct A { int n; }; @@ -1074,7 +1074,7 @@ namespace cwg169 { // cwg169: yes using B::n; // expected-error@-1 {{using declaration cannot refer to a template specialization}} }; -} +} // namespace cwg169 namespace { // cwg171: 3.4 int cwg171a; @@ -1085,7 +1085,7 @@ namespace cwg171 { extern "C" void cwg171b(); // expected-error@-1 {{declaration of 'cwg171b' with C language linkage conflicts with declaration in global scope}} // expected-note@#cwg171b-int {{declared in global scope here}} -} +} // namespace cwg171 namespace cwg172 { // cwg172: yes enum { zero }; @@ -1119,13 +1119,13 @@ namespace cwg172 { // cwg172: yes // cxx98-error@-1 {{'long long' is a C++11 extension}} static_assert(sizeof(f) == sizeof(unsigned long), ""); static_assert(-f > 0, ""); -} +} // namespace cwg172 namespace cwg173 { // cwg173: yes static_assert('0' + 1 == '1' && '0' + 2 == '2' && '0' + 3 == '3' && '0' + 4 == '4' && '0' + 5 == '5' && '0' + 6 == '6' && '0' + 7 == '7' && '0' + 8 == '8' && '0' + 9 == '9', ""); -} +} // namespace cwg173 // cwg174: sup 1012 @@ -1139,7 +1139,7 @@ namespace cwg175 { // cwg175: 2.8 // expected-note@#cwg175-A {{member is declared here}} cwg175::A b; }; -} +} // namespace cwg175 namespace cwg176 { // cwg176: 3.1 template class Y; @@ -1179,7 +1179,7 @@ namespace cwg176 { // cwg176: 3.1 // since-cxx17-error@#cwg176-p4 {{use of class template 'cwg176::X' requires template arguments; argument deduction not allowed in non-static class member}} // since-cxx17-note@#cwg176-X {{template is declared here}} }; -} +} // namespace cwg176 namespace cwg177 { // cwg177: yes struct B {}; @@ -1200,7 +1200,7 @@ namespace cwg177 { // cwg177: yes C c = e; // expected-error@-1 {{no viable constructor copying variable of type 'D'}} // expected-note@#cwg177-C-copy-ctor {{candidate constructor not viable: expects an lvalue for 1st argument}} -} +} // namespace cwg177 namespace cwg178 { // cwg178: yes 
static_assert(int() == 0, ""); @@ -1213,13 +1213,13 @@ namespace cwg178 { // cwg178: yes struct U : S { constexpr U() : S() {} }; static_assert(U().b == 0, ""); #endif -} +} // namespace cwg178 namespace cwg179 { // cwg179: yes void f(); int n = &f - &f; // expected-error@-1 {{arithmetic on pointers to the function type 'void ()'}} -} +} // namespace cwg179 namespace cwg180 { // cwg180: 2.8 template struct X : T, T::some_base { @@ -1229,7 +1229,7 @@ namespace cwg180 { // cwg180: 2.8 enum T::some_enum e; } }; -} +} // namespace cwg180 namespace cwg181 { // cwg181: yes namespace X { @@ -1243,7 +1243,7 @@ namespace cwg181 { // cwg181: yes template